diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile
index d9cf8cc771..906153a621 100644
--- a/.circleci/docker/Dockerfile
+++ b/.circleci/docker/Dockerfile
@@ -1,6 +1,6 @@
ARG PYTORCH="1.8.1"
-ARG CUDA="10.2"
-ARG CUDNN="7"
+ARG CUDA="11.1"
+ARG CUDNN="8"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
diff --git a/.circleci/test.yml b/.circleci/test.yml
index 664a803811..5c57cd74b9 100644
--- a/.circleci/test.yml
+++ b/.circleci/test.yml
@@ -64,19 +64,14 @@ jobs:
pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main
pip install -U openmim
mim install 'mmcv >= 2.0.0'
- pip install git+ssh://git@github.com/open-mmlab/mmdetection.git@dev-3.x
+ pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
pip install -r requirements.txt
- - when:
- condition:
- equal: [ "1.13.0", << parameters.torch >> ]
- steps:
- - run: pip install timm
- - when:
- condition:
- equal: [ "1.6.0", << parameters.torch >> ]
- steps:
- - run: pip install timm==0.6.7
+ - run:
+ name: Install timm
+ command: |
+ pip install timm
- when:
condition:
equal: [ "0.10.0", << parameters.torchvision >> ]
@@ -98,7 +93,7 @@ jobs:
type: string
cuda:
type: enum
- enum: ["11.0"]
+ enum: ["11.1"]
cudnn:
type: integer
default: 8
@@ -129,6 +124,7 @@ jobs:
docker exec mmaction pip install -U openmim
docker exec mmaction mim install 'mmcv >= 2.0.0'
docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x
+ docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmpose.git@dev-1.x
docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
docker exec mmaction pip install -r requirements.txt
- run:
@@ -141,7 +137,6 @@ jobs:
docker exec mmaction pytest tests/
workflows:
pr_stage_lint:
- when: << pipeline.parameters.lint_only >>
jobs:
- lint:
name: lint
@@ -164,8 +159,8 @@ workflows:
- main
- build_cpu:
name: minimum_version_cpu
- torch: 1.6.0
- torchvision: 0.7.0
+ torch: 1.8.1
+ torchvision: 0.9.1
python: 3.7.4
requires:
- lint
@@ -185,7 +180,7 @@ workflows:
torch: 1.8.1
# Use double quotation mark to explicitly specify its type
# as string instead of number
- cuda: "11.0"
+ cuda: "11.1"
requires:
- hold
merge_stage_test:
@@ -195,10 +190,10 @@ workflows:
jobs:
- build_cuda:
name: minimum_version_gpu
- torch: 1.7.1
+ torch: 1.8.1
# Use double quotation mark to explicitly specify its type
# as string instead of number
- cuda: "11.0"
+ cuda: "11.1"
filters:
branches:
only:
diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml
index 60df0a1245..0b83911506 100644
--- a/.github/workflows/merge_stage_test.yml
+++ b/.github/workflows/merge_stage_test.yml
@@ -60,6 +60,8 @@ jobs:
run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install MMCls
run: pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ - name: Install MMPose
+ run: pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
- name: Install PytorchVideo
run: pip install pytorchvideo
if: ${{matrix.torchvision == '0.10.0'}}
@@ -78,12 +80,8 @@ jobs:
strategy:
matrix:
python-version: [3.7]
- torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1]
+ torch: [1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1]
include:
- - torch: 1.6.0
- torchvision: 0.7.0
- - torch: 1.7.1
- torchvision: 0.8.2
- torch: 1.8.1
torchvision: 0.9.1
- torch: 1.9.1
@@ -108,12 +106,8 @@ jobs:
run: pip install librosa soundfile
- name: Install lmdb
run: pip install lmdb
- - name: Install timm
- run: pip install timm==0.6.7
- if: ${{matrix.torch == '1.6.0'}}
- name: Install timm
run: pip install timm
- if: ${{matrix.torch != '1.6.0'}}
- name: Install TurboJpeg lib
run: sudo apt-get install -y libturbojpeg
- name: Install PyTorch
@@ -128,6 +122,8 @@ jobs:
run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install MMCls
run: pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ - name: Install MMPose
+ run: pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
- name: Install unittest dependencies
run: pip install -r requirements.txt
- name: Install PytorchVideo
@@ -190,6 +186,7 @@ jobs:
mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
pip install -r requirements.txt
- name: Install PytorchVideo
run: pip install pytorchvideo
@@ -214,7 +211,10 @@ jobs:
with:
python-version: ${{ matrix.python }}
- name: Upgrade pip
- run: pip install pip --upgrade
+ run: |
+ python -V
+ python -m pip install pip --upgrade
+ python -m pip install wheel
- name: Install librosa and soundfile
run: python -m pip install librosa soundfile
- name: Install lmdb
@@ -228,6 +228,7 @@ jobs:
mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
pip install -r requirements.txt
- name: Install PytorchVideo
run: python -m pip install pytorchvideo
@@ -235,7 +236,7 @@ jobs:
run: python -m pip install timm
- name: Build and install
run: |
- pip install -e .
+ pip install -e . -v
- name: Run unittests and generate coverage report
run: |
pytest tests/
diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml
index 401dfdd7de..2513d38596 100644
--- a/.github/workflows/pr_stage_test.yml
+++ b/.github/workflows/pr_stage_test.yml
@@ -30,7 +30,7 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
- run: pip install pip --upgrade && pip install wheel
+ run: pip install pip --upgrade && pip install wheel
- name: Install soundfile lib
run: sudo apt-get install -y libsndfile1
- name: Install librosa and soundfile
@@ -51,6 +51,8 @@ jobs:
run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install MMCls
run: pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ - name: Install MMPose
+ run: pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
- name: Install unittest dependencies
run: pip install -r requirements.txt
- name: Install PytorchVideo
@@ -95,7 +97,7 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
- run: pip install pip --upgrade && pip install wheel
+ run: pip install pip --upgrade && pip install wheel
- name: Fetch GPG keys
run: |
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
@@ -117,6 +119,7 @@ jobs:
mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
pip install -r requirements.txt
- name: Install PytorchVideo
run: pip install pytorchvideo
@@ -165,6 +168,7 @@ jobs:
mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
+ pip install git+https://github.com/open-mmlab/mmpose.git@dev-1.x
pip install -r requirements.txt
- name: Install PytorchVideo
run: python -m pip install pytorchvideo
diff --git a/.gitignore b/.gitignore
index 1d637fa156..d3f3b2b874 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,8 +64,16 @@ instance/
# Scrapy stuff:
.scrapy
-# Sphinx documentation
+# Auto-generated documentation
docs/*/_build/
+docs/*/model_zoo/
+docs/*/dataset_zoo/
+docs/*/_model_zoo.rst
+docs/*/modelzoo_statistics.md
+docs/*/datasetzoo_statistics.md
+docs/*/projectzoo.md
+docs/*/papers/
+docs/*/api/generated/
# PyBuilder
target/
@@ -133,11 +141,11 @@ work_dirs/
!tests/data/**/*.pth
# avoid soft links created by MIM
-mmaction/configs/*
mmaction/tools/*
*.ipynb
# unignore ipython notebook files in demo
!demo/*.ipynb
+!projects/stad_tutorial/*.ipynb
mmaction/.mim
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9583935859..a5cb6ca522 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,7 +34,7 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
- args: ["--skip", "*.ipynb", "-L", "Gool,tread,gool,mot"]
+ args: ["--skip", "*.ipynb", "-L", "ECT,Gool,tread,gool,mot"]
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.14
hooks:
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 65a65ba1ba..73e5c931b2 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -3,7 +3,7 @@ version: 2
build:
os: ubuntu-22.04
tools:
- python: "3.7"
+ python: "3.9"
formats:
- epub
diff --git a/MANIFEST.in b/MANIFEST.in
index 258c4e016b..ccb77d0945 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
include mmaction/.mim/model-index.yml
+include mmaction/.mim/dataset-index.yml
recursive-include mmaction/.mim/configs *.py *.yml
recursive-include mmaction/.mim/tools *.sh *.py
diff --git a/README.md b/README.md
index 75597cc887..f9997f54f7 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
[📘Documentation](https://mmaction2.readthedocs.io/en/latest/) |
[🛠️Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) |
-[👀Model Zoo](https://mmaction2.readthedocs.io/en/latest/model_zoo/modelzoo.html) |
+[👀Model Zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo_statistics.html) |
[🆕Update News](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) |
[🚀Ongoing Projects](https://github.com/open-mmlab/mmaction2/projects) |
[🤔Reporting Issues](https://github.com/open-mmlab/mmaction2/issues/new/choose)
@@ -75,12 +75,14 @@ English | [简体中文](/README_zh-CN.md)
**The default branch has been switched to `main`(previous `1.x`) from `master`(current `0.x`), and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/latest/migration.html) for more details.**
-**Release (2023.04.06)**: v1.0.0 with the following new features:
+**Release (2023.07.04)**: v1.1.0 with the following new features:
-- Support RGB-PoseC3D(CVPR'2022).
-- Support training UniFormer V2(Arxiv'2022).
-- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects.
-- Refactor and provide more user-friendly documentation.
+- Support CLIP-based multi-modality models: ActionCLIP (ArXiv'2021) and CLIP4Clip (ArXiv'2022)
+- Support rich projects: gesture recognition, spatio-temporal action detection tutorial, and knowledge distillation
+- Support HACS-segments dataset(ICCV'2019), MultiSports dataset(ICCV'2021), Kinetics-710 dataset(Arxiv'2022)
+- Support VideoMAE V2(CVPR'2023), and VideoMAE(NeurIPS'2022) on action detection
+- Support TCANet(CVPR'2021)
+- Support [Pure Python style Configuration File](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) and downloading datasets by MIM with one command
## 📖 Introduction [🔝](#-table-of-contents)
@@ -106,7 +108,7 @@ It is a part of the [OpenMMLab](http://openmmlab.com/) project.
- **Modular design**: We decompose a video understanding framework into different components. One can easily construct a customized video understanding framework by combining different modules.
-- **Support four major video understanding tasks**: MMAction2 implements various algorithms for multiple video understanding tasks, including action recognition, action localization, spatio-temporal action detection, and skeleton-based action detection.
+- **Support five major video understanding tasks**: MMAction2 implements various algorithms for multiple video understanding tasks, including action recognition, action localization, spatio-temporal action detection, skeleton-based action recognition, and video retrieval.
- **Well tested and documented**: We provide detailed documentation and API reference, as well as unit tests.
@@ -172,15 +174,15 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
MultiModality: Audio (ArXiv'2020) |
TANet (ArXiv'2020) |
TimeSformer (ICML'2021) |
+ ActionCLIP (ArXiv'2021) |
VideoSwin (CVPR'2022) |
- VideoMAE (NeurIPS'2022) |
+ VideoMAE (NeurIPS'2022) |
MViT V2 (CVPR'2022) |
UniFormer V1 (ICLR'2022) |
UniFormer V2 (Arxiv'2022) |
- |
- |
+ VideoMAE V2 (CVPR'2023) |
Action Localization |
@@ -188,6 +190,7 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
BSN (ECCV'2018) |
BMN (ICCV'2019) |
+ TCANet (CVPR'2021) |
|
|
@@ -199,7 +202,7 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
SlowOnly+Fast R-CNN (ICCV'2019) |
SlowFast+Fast R-CNN (ICCV'2019) |
LFB (CVPR'2019) |
- |
+ VideoMAE (NeurIPS'2022) |
Skeleton-based Action Recognition |
@@ -214,7 +217,21 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
MSG3D (CVPR'2020) |
|
+ |
+ |
+ |
+
+
+ Video Retrieval |
+
+ CLIP4Clip (ArXiv'2022) |
+ |
+ |
+ |
+ |
+
+
@@ -247,7 +264,7 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
FineGYM (Homepage) (CVPR'2020) |
- |
+ Kinetics-710 (Homepage) (Arxiv'2022) |
|
|
@@ -257,7 +274,7 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
THUMOS14 (Homepage) (THUMOS Challenge 2014) |
ActivityNet (Homepage) (CVPR'2015) |
- |
+ HACS (Homepage) (ICCV'2019) |
|
@@ -269,6 +286,12 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
AVA (Homepage) (CVPR'2018) |
AVA-Kinetics (Homepage) (Arxiv'2020) |
+
+ MultiSports (Homepage) (ICCV'2021) |
+ |
+ |
+ |
+
Skeleton-based Action Recognition |
@@ -278,6 +301,16 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
PoseC3D-UCF101 (Homepage) (ArXiv'2021) |
PoseC3D-HMDB51 (Homepage) (ArXiv'2021) |
+
+ Video Retrieval |
+
+
+ MSRVTT (Homepage) (CVPR'2016) |
+ |
+ |
+ |
+
+
@@ -287,7 +320,7 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc
For tutorials, we provide the following user guides for basic usage:
- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/latest/migration.html)
-- [Learn about Configs](https://mmaction2.readthedocs.io/en/latest/user_guides/config.html#)
+- [Learn about Configs](https://mmaction2.readthedocs.io/en/latest/user_guides/config.html)
- [Prepare Datasets](https://mmaction2.readthedocs.io/en/latest/user_guides/prepare_dataset.html)
- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/latest/user_guides/inference.html)
- [Training and Testing](https://mmaction2.readthedocs.io/en/latest/user_guides/train_test.html)
@@ -332,20 +365,24 @@ We wish that the toolbox and benchmark could serve the growing research communit
- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
+- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries.
- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark.
-- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
-- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
-- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab Model Deployment Framework.
-- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
-- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox.
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.
- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab.
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 5e866c3402..7cdea2c165 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -19,18 +19,18 @@
[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/)
[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions)
-[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2)
+[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2)
[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/)
-[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/master/LICENSE)
+[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/main/LICENSE)
[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues)
[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues)
-[📘文档](https://mmaction2.readthedocs.io/zh_CN/latest/) |
-[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN/latest/get_started.html) |
-[👀模型库](https://mmaction2.readthedocs.io/zh_CN/latest/modelzoo.html) |
-[🆕更新](https://mmaction2.readthedocs.io/zh_CN/latest/notes/changelog.html) |
+[📘中文文档](https://mmaction2.readthedocs.io/zh_CN/latest/index.html) |
+[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN/get_started/installation.html) |
+[👀模型库](https://mmaction2.readthedocs.io/zh_CN/latest/modelzoo_statistics.html) |
+[🆕更新日志](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) |
[🚀进行中项目](https://github.com/open-mmlab/mmaction2/projects) |
-[🤔问题反馈](https://github.com/open-mmlab/mmaction2/issues/new/choose)
+[🤔报告问题](https://github.com/open-mmlab/mmaction2/issues/new/choose)
@@ -56,57 +56,75 @@
[English](/README.md) | 简体中文
-## 简介
+## 📄 目录
-MMAction2 是一款基于 PyTorch 的视频理解开源工具箱,是 [OpenMMLab](https://openmmlab.com/) 项目的成员之一
+- [📄 目录](#-目录)
+- [🥳 🚀 最新进展](#--最新进展-)
+- [📖 简介](#-简介-)
+- [🎁 主要功能](#-主要功能-)
+- [🛠️ 安装](#️-安装-)
+- [👀 模型库](#-模型库-)
+- [👨🏫 新手入门](#-新手入门-)
+- [🎫 许可证](#-许可证-)
+- [🖊️ 引用](#️-引用-)
+- [🙌 参与贡献](#-参与贡献-)
+- [🤝 致谢](#-致谢-)
+- [🏗️ OpenMMLab 的其他项目](#️-openmmlab-的其他项目-)
+- [❤️ 欢迎加入 OpenMMLab 社区](#️-欢迎加入-openmmlab-社区-)
-main 分支代码目前支持 **PyTorch 1.6以上** 的版本
+## 🥳 🚀 最新进展 [🔝](#-table-of-contents)
+
+**默认分支已经从 `master` (当前的`0.x`) 切换到 `main`(之前的 `1.x`),我们建议用户更新至最新版本,其支持更多模型,更强的预训练权重,以及更简洁的代码实现。详情请参阅[迁移指南](https://mmaction2.readthedocs.io/zh_cn/latest/migration.html)**
+
+**Release (2023.07.04)**: v1.1.0 支持以下新功能:
+
+- 支持基于 CLIP 的多模态模型: ActionCLIP(Arxiv'2021) 和 CLIP4clip(ArXiv'2022)
+- 支持丰富的 project: 手势识别, 时空行为检测 tutorial, 以及基于 [MMRazor](https://github.com/open-mmlab/mmrazor) 的知识蒸馏
+- 支持 HACS-segments 数据集(ICCV'2019), MultiSports 数据集(ICCV'2021), Kinetics-710 数据集(Arxiv'2022)
+- 支持 VideoMAE V2(CVPR'2023), VideoMAE(NeurIPS'2022) 支持时空行为检测任务
+- 支持 TCANet(CVPR'2021)
+- 支持 [纯 Python 风格的配置文件](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) 和使用 MIM 一键下载数据集
+
+## 📖 简介 [🔝](#-table-of-contents)
+
+MMAction2 是一款基于 PyTorch 开发的行为识别开源工具包,是 [open-mmlab](https://github.com/open-mmlab) 项目的一个子项目。
-
-
-
Kinetics-400 上的动作识别
-
-
-
-
NTURGB+D-120 上的基于人体姿态的动作识别
-
+
+
+
Kinetics-400 数据集行为识别结果(左) 和 NTU-RGB+D-120 数据集基于骨架的行为识别结果(右)
+
-
Kinetics-400 上的基于 skeleton 的时空动作检测和动作识别
+
Kinetics-400 数据集基于骨骼点的时空行为检测及视频行为识别结果
-
AVA-2.1 上的时空动作检测
+
AVA-2.1 数据集时空行为检测结果
-## 主要特性
+## 🎁 主要功能 [🔝](#-table-of-contents)
-- **模块设计**:MMAction2 将统一的视频理解框架解耦成不同的模块组件,通过组合不同的模块组件,用户可以便捷地构建自定义的视频理解模型
+- **模块化设计**: 我们将视频理解框架拆分成了不同模块,用户可以很方便地通过组合不同的模块来构建出自定义的视频理解框架。
-- **支持多种任务和数据集**:MMAction2 支持多种视频理解任务,包括动作识别,时序动作检测,时空动作检测以及基于人体姿态的动作识别
+- **支持五种主要的视频理解任务**: MMAction2 为视频理解任务实现了多种多样的算法,包括行为识别,时序动作定位,时空动作检测,基于骨骼点的行为识别,以及视频检索。
-- **详尽的单元测试和文档**:MMAction2 提供了详尽的说明文档,API 接口说明,全面的单元测试,以供社区参考
+- **详尽的单元测试和文档**:我们提供了详尽的文档和 API 参考手册,以及单元测试。
-## 更新记录
+## 🛠️ 安装 [🔝](#-table-of-contents)
-**v1.0.0 版本 (2023.04.06)**:
+MMAction2依赖于 [PyTorch](https://pytorch.org/),[MMCV](https://github.com/open-mmlab/mmcv),[MMEngine](https://github.com/open-mmlab/mmengine),[MMDetection](https://github.com/open-mmlab/mmdetection) (可选)和 [MMPose](https://github.com/open-mmlab/mmpose) (可选)
-- 支持骨骼动作识别模型 RGB-PoseC3D (CVPR'2022) .
-- 在 Projects 中支持 MSG3D(CVPR'2020) 和 CTRGCN(CVPR'2021).
-- 支持训练 UniFormer V2(Arxiv'2022).
-- 重构升级用户文档
+具体步骤请参考 [安装文档](https://mmaction2.readthedocs.io/zh_cn/latest/get_started/installation.html)。
-## 安装
-
-MMAction2 依赖 [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (可选), [MMPose](https://github.com/open-mmlab/mmpose) (可选),以下是安装的简要步骤。
-更详细的安装指南请参考 [install.md](https://mmaction2.readthedocs.io/zh_CN/latest/get_started.html) 。
+
+快速安装
```shell
conda create --name openmmlab python=3.8 -y
conda activate openmmlab
-conda install pytorch torchvision -c pytorch # 以上命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请检查它们是否和你的环境匹配
+conda install pytorch torchvision -c pytorch # 该命令将自动安装最新版的 PyTorch 和 cudatoolkit,请确认此是否匹配你的当前环境。
pip install -U openmim
mim install mmengine
mim install mmcv
@@ -117,11 +135,19 @@ cd mmaction2
pip install -v -e .
```
-## 模型库
+
+
+## 👀 模型库 [🔝](#-table-of-contents)
+
+结果及模型位于[模型库](https://mmaction2.readthedocs.io/zh_cn/latest/modelzoo_statistics.html)
+
+
+
+模型支持
-各个模型的结果和设置都可以在对应的 config 目录下的 *README_zh-CN.md* 中查看。整体的概况也可也在 [**模型库**](https://mmaction2.readthedocs.io/zh_CN/latest/modelzoo.html) 页面中查看。
+
-MMAction2 将跟进学界的最新进展,并支持更多算法和框架。如果您对 MMAction2 有任何功能需求,请随时在 [问题](https://github.com/open-mmlab/mmaction2/issues/19) 中留言。
+
-## 数据集
+数据集支持
-标记 * 代表对应数据集并未被完全支持,但提供相应的数据准备步骤。整体的概况也可也在 [**数据集**](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 页面中查看。
-
-## 数据集准备
-
-请参考 [数据准备](https://mmaction2.readthedocs.io/en/latest/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。
+
-## FAQ
+## 👨🏫 新手入门 [🔝](#-table-of-contents)
-请参考 [FAQ](docs/zh_cn/notes/faq.md) 了解其他用户的常见问题。
+我们提供了一系列简明的教程,帮助新用户轻松上手使用:
-## 相关工作
+- [从 MMAction2 0.X 迁移](https://mmaction2.readthedocs.io/zh_cn/latest/migration.html)
+- [学习配置相关知识](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/config.html)
+- [准备数据集](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/prepare_dataset.html)
+- [使用现有模型进行推理](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/inference.html)
+- [训练与测试](https://mmaction2.readthedocs.io/zh_cn/latest/user_guides/train_test.html)
-目前有许多研究工作或工程项目基于 MMAction2 搭建,例如:
+
+基于 MMAction2 的社区工作
-- Video Swin Transformer. [\[论文\]](https://arxiv.org/abs/2106.13230)[\[代码\]](https://github.com/SwinTransformer/Video-Swin-Transformer)
-- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2107.10161)[\[代码\]](https://github.com/Cogito2012/DEAR)
-- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2103.17263)[\[代码\]](https://github.com/xvjiarui/VFS)
+- Video Swin Transformer. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer)
+- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR)
+- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS)
-更多详情可见 [相关工作](docs/en/notes/projects.md) 。
+
-## 许可
+## 🎫 许可证 [🔝](#-table-of-contents)
-该项目开源自 [Apache 2.0 license](LICENSE).
+本项目基于 [Apache 2.0 license](LICENSE) 发布。
-## 引用
+## 🖊️ 引用 [🔝](#-table-of-contents)
-如果你觉得 MMAction2 对你的研究有所帮助,可以考虑引用它:
+如你发现本项目对你的研究有帮助,请参考如下 bibtex 引用 MMAction2。
```BibTeX
@misc{2020mmaction2,
@@ -290,33 +344,55 @@ MMAction2 将跟进学界的最新进展,并支持更多算法和框架。如
}
```
-## 参与贡献
+## 🙌 参与贡献 [🔝](#-table-of-contents)
-我们非常欢迎用户对于 MMAction2 做出的任何贡献,可以参考 [贡献指南](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING_zh-CN.md) 文件了解更多细节。
+我们感谢所有的贡献者为改进和提升 MMAction2 所作出的努力。请参考[贡献指南](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md)来了解参与项目贡献的相关指引。
-## 致谢
+## 🤝 致谢 [🔝](#-table-of-contents)
-MMAction2 是一款由不同学校和公司共同贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。
-我们希望该工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现现有算法并开发自己的新模型,从而不断为开源社区提供贡献。
+MMAction2 是一款由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 我们希望此工具箱可以帮助大家来复现已有的方法和开发新的方法,从而为研究社区贡献力量。
-## OpenMMLab 的其他项目
+## 🏗️ OpenMMLab 的其他项目 [🔝](#-table-of-contents)
- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库
+- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口
+- [MMEval](https://github.com/open-mmlab/mmeval): 统一开放的跨框架算法评测库
- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab 深度学习预训练工具箱
-- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成(AIGC)工具箱
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
-- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准
+- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成(AIGC)工具箱
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱
- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架
-- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
-- [MIM](https://github.com/open-mmlab/mim): OpenMMlab 项目、算法、模型的统一入口
- [Playground](https://github.com/open-mmlab/playground): 收集和展示 OpenMMLab 相关的前沿、有趣的社区项目
+
+## ❤️ 欢迎加入 OpenMMLab 社区 [🔝](#-table-of-contents)
+
+扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=aCvMxdr3) 或联络 OpenMMLab 官方微信小助手
+
+
+
+
+
+我们会在 OpenMMLab 社区为大家
+
+- 📢 分享 AI 框架的前沿核心技术
+- 💻 解读 PyTorch 常用模块源码
+- 📰 发布 OpenMMLab 的相关新闻
+- 🚀 介绍 OpenMMLab 开发的前沿算法
+- 🏃 获取更高效的问题答疑和意见反馈
+- 🔥 提供与各行各业开发者充分交流的平台
+
+干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬
diff --git a/configs/detection/lfb/README.md b/configs/detection/lfb/README.md
index 93f3f6cbed..39aa77fa69 100644
--- a/configs/detection/lfb/README.md
+++ b/configs/detection/lfb/README.md
@@ -73,7 +73,7 @@ Example: train LFB model on AVA with half-precision long-term feature bank.
```shell
python tools/train.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py \
- --validate --seed 0 --deterministic
+ --seed 0 --deterministic
```
For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py
index cbd11d67d4..2ba637545c 100644
--- a/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py
+++ b/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py
@@ -1,19 +1,69 @@
# This config is used to generate long-term feature bank.
-_base_ = [
- '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py'
-]
+_base_ = ['../../_base_/default_runtime.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
dataset_mode = 'val' # ['train', 'val', 'test']
+url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-'
+ 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
+ 'kinetics400-rgb_20220901-e7b65fad.pth')
+
model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5),
shared_head=dict(
type='LFBInferHead',
lfb_prefix_path=lfb_prefix_path,
dataset_mode=dataset_mode,
- use_half_precision=True)))
+ use_half_precision=True)),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ _scope_='mmaction',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
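+ # rcnn test cfg is None: this run only performs inference so that LFBInferHead can dump RoI features into the long-term feature bank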
+ test_cfg=dict(rcnn=None))
# dataset settings
dataset_type = 'AVADataset'
diff --git a/configs/detection/slowfast/README.md b/configs/detection/slowfast/README.md
index f82273adcc..985877134d 100644
--- a/configs/detection/slowfast/README.md
+++ b/configs/detection/slowfast/README.md
@@ -34,10 +34,20 @@ We present SlowFast networks for video recognition. Our model involves (i) a Slo
| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.41 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.65 | [config](/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
+### MultiSports
+
+| frame sampling strategy | gpus | backbone | pretrain | f-mAP | v-mAP@0.2 | v-mAP@0.5 | v-mAP@0.1:0.9 | gpu_mem(M) | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :-------: | :-------: | :-----------: | :--------: | :--------------------------------: | :------------------------------: | :------------------------------: |
+| 4x16x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 36.88 | 22.83 | 16.9 | 14.74 | 18618 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-af666368.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log) |
+
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter auto-scales the learning rate according to the actual batch size and the original batch size.
2. **with context** indicates that using both RoI feature and global pooled feature for classification; **temporal-max** indicates that using max pooling in the temporal dimension for the feature.
+3. The MultiSports dataset uses frame-mAP (f-mAP) and video-mAP (v-mAP) to evaluate performance. Frame-mAP evaluates the detection results of each frame, while video-mAP uses 3D IoU to evaluate tube-level results under several thresholds; see the testing sketch below and the [competition page](https://codalab.lisn.upsaclay.fr/competitions/3736#learn_the_details-evaluation) for details.
+
+For more details on data preparation, you can refer to
-For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md).
+- [AVA](/tools/data/ava/README.md)
+- [MultiSports](/tools/data/multisports/README.md)
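+
+As a minimal sketch of how the MultiSports checkpoint above can be evaluated (the checkpoint path is a placeholder for a locally downloaded weight file), testing uses the standard entry point, and the `MultiSportsMetric` evaluator configured in the config reports f-mAP and v-mAP:
+
+```shell
+python tools/test.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth
+```
+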
## Train
diff --git a/configs/detection/slowfast/metafile.yml b/configs/detection/slowfast/metafile.yml
index 2ab6c44a45..65c8e84bf1 100644
--- a/configs/detection/slowfast/metafile.yml
+++ b/configs/detection/slowfast/metafile.yml
@@ -119,3 +119,23 @@ Models:
mAP: 26.65
Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth
+
+ - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb
+ Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py
+ In Collection: SlowFast
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 8
+ Pretrained: Kinetics-400
+ Resolution: short-side 320
+ Training Data: MultiSports
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: MultiSports
+ Task: Action Detection
+ Metrics:
+ f-mAP: 36.88
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-af666368.pth
diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py
new file mode 100644
index 0000000000..f3b0bdeb3f
--- /dev/null
+++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py
@@ -0,0 +1,134 @@
+_base_ = [
+ '../slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py' # noqa: E501
+]
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/'
+ 'slowfast_r50_4x16x1_256e_kinetics400_rgb/'
+ 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth')
+num_classes = 66
+model = dict(
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ _delete_=True,
+ type='mmaction.ResNet3dSlowFast',
+ pretrained=None,
+ resample_rate=8,
+ speed_ratio=8,
+ channel_ratio=8,
+ slow_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=True,
+ conv1_kernel=(1, 7, 7),
+ dilations=(1, 1, 1, 1),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1),
+ spatial_strides=(1, 2, 2, 1)),
+ fast_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=False,
+ base_channels=8,
+ conv1_kernel=(5, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1))),
+ roi_head=dict(bbox_head=dict(in_channels=2304)))
+
+dataset_type = 'AVADataset'
+data_root = 'data/multisports/trainval'
+anno_root = 'data/multisports/annotations'
+
+ann_file_train = f'{anno_root}/multisports_train.csv'
+ann_file_val = f'{anno_root}/multisports_val.csv'
+gt_file = f'{anno_root}/multisports_GT.pkl'
+
+proposal_file_train = f'{anno_root}/multisports_dense_proposals_train.recall_96.13.pkl' # noqa: E501
+proposal_file_val = f'{anno_root}/multisports_dense_proposals_val.recall_96.13.pkl' # noqa: E501
+
+file_client_args = dict(io_backend='disk')
+
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
+ dict(type='DecordDecode'),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+# The testing is w/o. any cropping / flipping
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ num_classes=num_classes,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root),
+ timestamp_start=1,
+ start_index=0,
+ use_frames=False,
+ fps=1,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ pipeline=val_pipeline,
+ num_classes=num_classes,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True,
+ timestamp_start=1,
+ start_index=0,
+ use_frames=False,
+ fps=1,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='MultiSportsMetric', ann_file=gt_file)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=8, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=8,
+ by_epoch=True,
+ milestones=[6, 7],
+ gamma=0.1)
+]
+
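+# SGD optimizer; gradients are clipped to a maximum L2 norm of 5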
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01125, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=5, norm_type=2))
diff --git a/configs/detection/slowonly/README.md b/configs/detection/slowonly/README.md
index ff0f7bf641..05c1de8b0e 100644
--- a/configs/detection/slowonly/README.md
+++ b/configs/detection/slowonly/README.md
@@ -52,13 +52,21 @@ We conduct ablation studies to show the improvements of training tricks using Sl
| + focal loss | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) |
| + more frames | 16x4x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) |
+### MultiSports
+
+| frame sampling strategy | gpus | backbone | pretrain | f-mAP | v-mAP@0.2 | v-mAP@0.5 | v-mAP@0.1:0.9 | gpu_mem(M) | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :-------: | :-------: | :-----------: | :--------: | :--------------------------------: | :------------------------------: | :------------------------------: |
+| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.40 | 15.48 | 10.62 | 9.65 | 8509 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log) |
+
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter auto-scales the learning rate according to the actual batch size and the original batch size (a minimal example is sketched below the data preparation links).
2. **+ context** indicates that using both RoI feature and global pooled feature for classification; **+ temporal max pooling** indicates that using max pooling in the temporal dimension for the feature; **nonlinear head** indicates that using a 2-layer mlp instead of a linear classifier.
+3. The MultiSports dataset uses frame-mAP (f-mAP) and video-mAP (v-mAP) to evaluate performance. Frame-mAP evaluates the detection results of each frame, while video-mAP uses 3D IoU to evaluate tube-level results under several thresholds. You can refer to the [competition page](https://codalab.lisn.upsaclay.fr/competitions/3736#learn_the_details-evaluation) for details.
For more details on data preparation, you can refer to
- [AVA](/tools/data/ava/README.md)
- [AVA-Kinetics](/tools/data/ava_kinetics/README.md)
+- [MultiSports](/tools/data/multisports/README.md)
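+
+A minimal sketch of scaling training down to 4 GPUs with `--auto-scale-lr` (the GPU count here is only an example; `tools/dist_train.sh` is the standard distributed launcher):
+
+```shell
+bash tools/dist_train.sh configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py 4 \
+    --auto-scale-lr
+```
+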
## Train
diff --git a/configs/detection/slowonly/metafile.yml b/configs/detection/slowonly/metafile.yml
index 11ca749351..c73f051211 100644
--- a/configs/detection/slowonly/metafile.yml
+++ b/configs/detection/slowonly/metafile.yml
@@ -100,3 +100,23 @@ Models:
mAP: 24.83
Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth
+
+ - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb
+ Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py
+ In Collection: SlowOnly
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 8
+ Pretrained: Kinetics-400
+ Resolution: short-side 320
+ Training Data: MultiSports
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: MultiSports
+ Task: Action Detection
+ Metrics:
+ f-mAP: 26.40
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth
diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py
new file mode 100644
index 0000000000..0d83ca0d48
--- /dev/null
+++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py
@@ -0,0 +1,150 @@
+_base_ = [
+ '../../_base_/default_runtime.py',
+]
+url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-'
+ 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
+ 'kinetics400-rgb_20220901-e7b65fad.pth')
+num_classes = 66
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=num_classes,
+ multilabel=False,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVADataset'
+data_root = 'data/multisports/trainval'
+anno_root = 'data/multisports/annotations'
+
+ann_file_train = f'{anno_root}/multisports_train.csv'
+ann_file_val = f'{anno_root}/multisports_val.csv'
+gt_file = f'{anno_root}/multisports_GT.pkl'
+
+proposal_file_train = f'{anno_root}/multisports_dense_proposals_train.recall_96.13.pkl' # noqa: E501
+proposal_file_val = f'{anno_root}/multisports_dense_proposals_val.recall_96.13.pkl' # noqa: E501
+
+file_client_args = dict(io_backend='disk')
+
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
+ dict(type='DecordDecode'),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+# The testing is w/o. any cropping / flipping
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ num_classes=num_classes,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root),
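+ # MultiSports annotations index frames directly, so timestamps map 1:1 to frame ids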
+ timestamp_start=1,
+ start_index=0,
+ use_frames=False,
+ fps=1,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ pipeline=val_pipeline,
+ num_classes=num_classes,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True,
+ timestamp_start=1,
+ start_index=0,
+ use_frames=False,
+ fps=1,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='MultiSportsMetric', ann_file=gt_file)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=8, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
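+# linear warm-up for the first 5 epochs, then 10x step decay at epochs 6 and 7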
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=8,
+ by_epoch=True,
+ milestones=[6, 7],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=5, norm_type=2))
diff --git a/configs/detection/videomae/README.md b/configs/detection/videomae/README.md
new file mode 100644
index 0000000000..9e1af3fd39
--- /dev/null
+++ b/configs/detection/videomae/README.md
@@ -0,0 +1,75 @@
+# VideoMAE
+
+[VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602)
+
+
+
+## Abstract
+
+
+
+Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking with an extremely high ratio. This simple design makes video reconstruction a more challenging self-supervision task, thus encouraging extracting more effective video representations during this pre-training process. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables a higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets is an important issue. Notably, our VideoMAE with the vanilla ViT can achieve 87.4% on Kinetics-400, 75.4% on Something-Something V2, 91.3% on UCF101, and 62.6% on HMDB51, without using any extra data.
+
+
+
+
+
+
+
+## Results and Models
+
+### AVA2.2
+
+Currently, we use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset.
+
+| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :-------: | :----------: | :--: | :---------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 16x4x1 | raw | 8 | ViT Base | Kinetics-400 | 33.6 | [config](/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-3dafab75.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log) |
+| 16x4x1 | raw | 8 | ViT Large | Kinetics-400 | 38.7 | [config](/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-bf93c9ea.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log) |
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the ViT-base model on AVA-Kinetics with deterministic training enabled.
+
+```shell
+python tools/train.py configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py \
+ --cfg-options randomness.seed=0 randomness.deterministic=True
+```
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the ViT base model on AVA-Kinetics and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py \
+ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
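+
+For multi-GPU evaluation, a minimal sketch assuming the repository also ships the usual `tools/dist_test.sh` helper (config and checkpoint paths as above, 8 GPUs):
+
+```shell
+# Distributed testing on 8 GPUs and dumping predictions to a pkl file (sketch)
+bash tools/dist_test.sh configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth 8 --dump result.pkl
+```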
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@inproceedings{tong2022videomae,
+ title={Video{MAE}: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training},
+ author={Zhan Tong and Yibing Song and Jue Wang and Limin Wang},
+ booktitle={Advances in Neural Information Processing Systems},
+ year={2022}
+}
+```
diff --git a/configs/detection/videomae/metafile.yml b/configs/detection/videomae/metafile.yml
new file mode 100644
index 0000000000..bfaa082a55
--- /dev/null
+++ b/configs/detection/videomae/metafile.yml
@@ -0,0 +1,47 @@
+Collections:
+- Name: VideoMAE
+ README: configs/detection/videomae/README.md
+ Paper:
+ URL: https://arxiv.org/abs/2203.12602
+ Title: "VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training"
+
+Models:
+ - Name: vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb
+ Config: configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
+ In Collection: VideoMAE
+ Metadata:
+ Architecture: Vision Transformer
+ Batch Size: 64
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Resolution: short-side 320
+ Training Data: AVA v2.2
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.2
+ Task: Action Detection
+ Metrics:
+ mAP: 33.6
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-3dafab75.pth
+
+ - Name: vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb
+ Config: configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
+ In Collection: VideoMAE
+ Metadata:
+ Architecture: Vision Transformer
+ Batch Size: 128
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Resolution: short-side 320
+ Training Data: AVA v2.2
+ Training Resources: 32 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.2
+ Task: Action Detection
+ Metrics:
+ mAP: 38.7
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb_20230314-bf93c9ea.pth
diff --git a/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py b/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
new file mode 100644
index 0000000000..6e5950b847
--- /dev/null
+++ b/configs/detection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
@@ -0,0 +1,170 @@
+_base_ = ['../../_base_/default_runtime.py']
+
+url = (
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/'
+ 'vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-860a3cd3.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.VisionTransformer',
+ img_size=224,
+ patch_size=16,
+ embed_dims=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ qkv_bias=True,
+ num_frames=16,
+ norm_cfg=dict(type='LN', eps=1e-6),
+ drop_path_rate=0.2,
+ use_mean_pooling=False,
+ return_feat_map=True),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=768,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ _scope_='mmaction',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVAKineticsDataset'
+data_root = 'data/ava_kinetics/rawframes'
+anno_root = 'data/ava_kinetics/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.2.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.2.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+train_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=16, frame_interval=4),
+ dict(type='RawFrameDecode'),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+# The testing pipeline uses no cropping or flipping
+val_pipeline = [
+ dict(
+ type='SampleAVAFrames', clip_len=16, frame_interval=4, test_mode=True),
+ dict(type='RawFrameDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ exclude_file=exclude_file_train,
+ pipeline=train_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root)))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ exclude_file=exclude_file_val,
+ pipeline=val_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.1,
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=15,
+ eta_min=0,
+ by_epoch=True,
+ begin=5,
+ end=20,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=1.25e-4, weight_decay=0.05),
+ constructor='LearningRateDecayOptimizerConstructor',
+ paramwise_cfg={
+ 'decay_rate': 0.75,
+ 'decay_type': 'layer_wise',
+ 'num_layers': 12
+ },
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=64)
diff --git a/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py b/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
new file mode 100644
index 0000000000..229f3ae013
--- /dev/null
+++ b/configs/detection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
@@ -0,0 +1,171 @@
+_base_ = ['../../_base_/default_runtime.py']
+
+url = (
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/'
+ 'vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-229dbb03.pth'
+)
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.VisionTransformer',
+ img_size=224,
+ patch_size=16,
+ embed_dims=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ qkv_bias=True,
+ num_frames=16,
+ norm_cfg=dict(type='LN', eps=1e-6),
+ drop_path_rate=0.2,
+ use_mean_pooling=False,
+ return_feat_map=True),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=1024,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ _scope_='mmaction',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVAKineticsDataset'
+data_root = 'data/ava_kinetics/rawframes'
+anno_root = 'data/ava_kinetics/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.2.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.2.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.2.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.2.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.2_for_activitynet_2019.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+train_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=16, frame_interval=4),
+ dict(type='RawFrameDecode'),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+# The testing pipeline uses no cropping or flipping
+val_pipeline = [
+ dict(
+ type='SampleAVAFrames', clip_len=16, frame_interval=4, test_mode=True),
+ dict(type='RawFrameDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ exclude_file=exclude_file_train,
+ pipeline=train_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root)))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ exclude_file=exclude_file_val,
+ pipeline=val_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.1,
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=15,
+ eta_min=0,
+ by_epoch=True,
+ begin=5,
+ end=20,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=2.5e-4, weight_decay=0.05),
+ constructor='LearningRateDecayOptimizerConstructor',
+ paramwise_cfg={
+ 'decay_rate': 0.8,
+ 'decay_type': 'layer_wise',
+ 'num_layers': 24
+ },
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=64)
diff --git a/configs/localization/bsn/metafile.yml b/configs/localization/bsn/metafile.yml
index b662010552..693c67d840 100644
--- a/configs/localization/bsn/metafile.yml
+++ b/configs/localization/bsn/metafile.yml
@@ -1,5 +1,5 @@
Collections:
-- Name: BMN
+- Name: BSN
README: configs/localization/bsn/README.md
Paper:
URL: https://arxiv.org/abs/1806.02964
diff --git a/configs/localization/tcanet/README.md b/configs/localization/tcanet/README.md
new file mode 100644
index 0000000000..41404db58f
--- /dev/null
+++ b/configs/localization/tcanet/README.md
@@ -0,0 +1,66 @@
+# TCANet
+
+[Temporal Context Aggregation Network for Temporal Action Proposal Refinement](https://openaccess.thecvf.com/content/CVPR2021/papers/Qing_Temporal_Context_Aggregation_Network_for_Temporal_Action_Proposal_Refinement_CVPR_2021_paper.pdf)
+
+
+
+## Abstract
+
+
+
+Temporal action proposal generation aims to estimate temporal intervals of actions in untrimmed videos, which is a challenging yet important task in the video understanding field.
+The proposals generated by current methods still suffer from inaccurate temporal boundaries and inferior confidence used for retrieval owing to the lack of efficient temporal modeling and effective boundary context utilization.
+In this paper, we propose Temporal Context Aggregation Network (TCANet) to generate high-quality action proposals through `local and global` temporal context aggregation and complementary as well as progressive boundary refinement.
+Specifically, we first design a Local-Global Temporal Encoder (LGTE), which adopts the channel grouping strategy to efficiently encode both `local and global` temporal inter-dependencies.
+Furthermore, both the boundary and internal context of proposals are adopted for frame-level and segment-level boundary regressions, respectively.
+Temporal Boundary Regressor (TBR) is designed to combine these two regression granularities in an end-to-end fashion, which achieves the precise boundaries and reliable confidence of proposals through progressive refinement. Extensive experiments are conducted on three challenging datasets: HACS, ActivityNet-v1.3, and THUMOS-14, where TCANet can generate proposals with high precision and recall. By combining with the existing action classifier, TCANet can obtain remarkable temporal action detection performance compared with other methods. Not surprisingly, the proposed TCANet won the 1$^{st}$ place in the CVPR 2020 - HACS challenge leaderboard on temporal action localization task.
+
+## Results and Models
+
+### HACS dataset
+
+| feature | gpus | pretrain | AUC | AR@1 | AR@5 | AR@10 | AR@100 | gpu_mem(M) | iter time(s) | config | ckpt | log |
+| :------: | :--: | :------: | :---: | :--: | :---: | :---: | :----: | :--------: | :----------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: |
+| SlowOnly | 2 | None | 51.39 | 3.61 | 16.92 | 21.94 | 62.80 | - | - | [config](/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature_20230621-d6bc10b0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.log) |
+
+For more details on data preparation, you can refer to [HACS Data Preparation](/tools/data/hacs/README.md).
+
+## Train
+
+Train the TCANet model on the HACS dataset using the SlowOnly feature.
+
+```shell
+bash tools/dist_train.sh configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py 2
+```
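+
+Training on a single GPU also works through `tools/train.py`. Since the config name encodes a 2 GPU x 8 samples-per-GPU batch, you may want to scale the learning rate linearly when the effective batch size changes; the value below is only an illustrative sketch, not a tuned recipe.
+
+```shell
+# Single-GPU training; halve the LR (0.001 -> 0.0005) to match the halved batch size (sketch)
+python3 tools/train.py configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py \
+    --cfg-options optim_wrapper.optimizer.lr=0.0005
+```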
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+Test the TCANet model on the HACS dataset using the SlowOnly feature.
+
+```shell
+python3 tools/test.py configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py CHECKPOINT.PTH
+```
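+
+The evaluator configured in this config computes AR@AN and dumps the evaluated proposals to `results.json` under the work directory. If you also want to keep the raw predictions, `tools/test.py` supports the same `--dump` option used elsewhere in this repository; the output filename below is just an example.
+
+```shell
+# Dump raw per-video predictions for offline analysis (sketch)
+python3 tools/test.py configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py CHECKPOINT.PTH \
+    --dump tcanet_hacs_results.pkl
+```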
+
+For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+
+
+```BibTeX
+@inproceedings{qing2021temporal,
+ title={Temporal Context Aggregation Network for Temporal Action Proposal Refinement},
+ author={Qing, Zhiwu and Su, Haisheng and Gan, Weihao and Wang, Dongliang and Wu, Wei and Wang, Xiang and Qiao, Yu and Yan, Junjie and Gao, Changxin and Sang, Nong},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={485--494},
+ year={2021}
+}
+```
diff --git a/configs/localization/tcanet/metafile.yml b/configs/localization/tcanet/metafile.yml
new file mode 100644
index 0000000000..d1fa3b940a
--- /dev/null
+++ b/configs/localization/tcanet/metafile.yml
@@ -0,0 +1,29 @@
+Collections:
+- Name: TCANet
+ README: configs/localization/tcanet/README.md
+ Paper:
+ URL: https://arxiv.org/abs/2103.13141
+ Title: "Temporal Context Aggregation Network for Temporal Action Proposal Refinement"
+
+Models:
+ - Name: tcanet_2xb8-700x100-9e_hacs-feature
+ Config: configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py
+ In Collection: TCANet
+ Metadata:
+ Batch Size: 8
+ Epochs: 9
+ Training Data: HACS
+ Training Resources: 2 GPUs
+ feature: slowonly
+ Modality: RGB
+ Results:
+ - Dataset: HACS
+ Task: Temporal Action Localization
+ Metrics:
+ AUC: 51.39
+ AR@1: 3.61
+ AR@5: 16.92
+ AR@10: 21.94
+ AR@100: 62.80
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature_20230621-d6bc10b0.pth
diff --git a/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py b/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py
new file mode 100644
index 0000000000..4b1efc0ede
--- /dev/null
+++ b/configs/localization/tcanet/tcanet_2xb8-700x100-9e_hacs-feature.py
@@ -0,0 +1,131 @@
+_base_ = '../../_base_/default_runtime.py'
+
+# model settings
+model = dict(
+ type='TCANet',
+ feat_dim=700,
+ se_sample_num=32,
+ action_sample_num=64,
+ temporal_dim=100,
+ window_size=9,
+ lgte_num=2,
+ soft_nms_alpha=0.4,
+ soft_nms_low_threshold=0.0,
+ soft_nms_high_threshold=0.0,
+ post_process_top_k=100,
+ feature_extraction_interval=16)
+
+# dataset settings
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/HACS/slowonly_feature/'
+data_root_val = 'data/HACS/slowonly_feature/'
+ann_file_train = 'data/HACS/hacs_anno_train.json'
+ann_file_val = 'data/HACS/hacs_anno_val.json'
+ann_file_test = 'data/HACS/hacs_anno_val.json'
+
+train_pipeline = [
+ dict(type='LoadLocalizationFeature'),
+ dict(type='GenerateLocalizationLabels'),
+ dict(
+ type='PackLocalizationInputs',
+ keys=('gt_bbox', 'proposals'),
+ meta_keys=('video_name', ))
+]
+
+val_pipeline = [
+ dict(type='LoadLocalizationFeature'),
+ dict(type='GenerateLocalizationLabels'),
+ dict(
+ type='PackLocalizationInputs',
+ keys=('gt_bbox', 'proposals'),
+ meta_keys=(
+ 'video_name',
+ 'duration_second',
+ 'duration_frame',
+ 'annotations',
+ 'feature_frame',
+ ))
+]
+
+test_pipeline = [
+ dict(type='LoadLocalizationFeature'),
+ dict(
+ type='PackLocalizationInputs',
+ keys=('gt_bbox', 'proposals'),
+ meta_keys=(
+ 'video_name',
+ 'duration_second',
+ 'duration_frame',
+ 'annotations',
+ 'feature_frame',
+ ))
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ drop_last=True,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_test,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+max_epochs = 9
+train_cfg = dict(
+ type='EpochBasedTrainLoop',
+ max_epochs=max_epochs,
+ val_begin=1,
+ val_interval=1)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+optim_wrapper = dict(
+ optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=max_epochs,
+ by_epoch=True,
+ milestones=[
+ 7,
+ ],
+ gamma=0.1)
+]
+
+work_dir = './work_dirs/tcanet_2xb8-700x100-9e_hacs-feature/'
+test_evaluator = dict(
+ type='ANetMetric',
+ metric_type='AR@AN',
+ dump_config=dict(out=f'{work_dir}/results.json', output_format='json'))
+val_evaluator = test_evaluator
diff --git a/configs/recognition/mvit/README.md b/configs/recognition/mvit/README.md
index 122341bcc7..8f867d8bee 100644
--- a/configs/recognition/mvit/README.md
+++ b/configs/recognition/mvit/README.md
@@ -63,7 +63,7 @@ the corresponding result without repeat augment is as follows:
| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
| :---------------------: | :--------: | :------: | :------: | :------: | :------: | :---------------------------: | :----------------------------: | :--------------: | :---: | :----: | :----------------: | :--------------: | :-------------: |
-| uniform 16 | 224x224 | MViTv2-S | K400 | 68.2 | 91.3 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | 64G | 34.4M | [config](/configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb_20230201-4065c1b9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.log) |
+| uniform 16 | 224x224 | MViTv2-S | K400 | 68.2 | 91.3 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | 64G | 34.4M | [config](/configs/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb_20230201-4065c1b9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb/mvit-small-p244_k400-pre_16xb16-u16-100e_sthv2-rgb.log) |
For more details on data preparation, you can refer to
diff --git a/configs/recognition/omnisource/README.md b/configs/recognition/omnisource/README.md
index f3397d3bb1..b5c16144e2 100644
--- a/configs/recognition/omnisource/README.md
+++ b/configs/recognition/omnisource/README.md
@@ -77,3 +77,12 @@ For more details, you can refer to the **Test** part in the [Training and Test T
year={2019}
}
```
+
+```BibTeX
+@article{duan2020omni,
+ title={Omni-sourced Webly-supervised Learning for Video Recognition},
+ author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua},
+ journal={arXiv preprint arXiv:2003.13042},
+ year={2020}
+}
+```
diff --git a/configs/recognition/omnisource/metafile.yml b/configs/recognition/omnisource/metafile.yml
index af4524e5b0..e622e9f06f 100644
--- a/configs/recognition/omnisource/metafile.yml
+++ b/configs/recognition/omnisource/metafile.yml
@@ -1,7 +1,9 @@
Collections:
- Name: Omnisource
README: configs/recognition/omnisource/README.md
-
+ Paper:
+ URL: https://arxiv.org/abs/2003.13042
+ Title: 'Omni-sourced Webly-supervised Learning for Video Recognition'
Models:
- Name: slowonly_r50_8xb16-8x8x1-256e_imagenet-kinetics400-rgb
diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md
index 78a3e043e3..bd95e7f69a 100644
--- a/configs/recognition/slowonly/README.md
+++ b/configs/recognition/slowonly/README.md
@@ -37,6 +37,12 @@ We present SlowFast networks for video recognition. Our model involves (i) a Slo
| 4x16x1 | Linear+MultiStep | 224x224 | 8x2 | ResNet50 | ImageNet | 65.52 | 86.39 | 10 clips x 3 crop | 27.38G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb_20221013-98b1b0a7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.log) |
| 8x8x1 | Linear+MultiStep | 224x224 | 8x2 | ResNet50 | ImageNet | 67.67 | 87.80 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.log) |
+
+### Kinetics-710
+
+| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :--------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :----------------------------: | :--------------------------: | :--------------------------: |
+| 8x8x1 | Linear+MultiStep | 224x224 | 8x4 | ResNet50 | ImageNet | 72.39 | 90.60 | 10 clips x 3 crop | 54.75G | 32.45M | [config](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb_20230612-12ce977c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.log) |
+
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size.
2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
diff --git a/configs/recognition/slowonly/metafile.yml b/configs/recognition/slowonly/metafile.yml
index 488e11b1c8..61a478e459 100644
--- a/configs/recognition/slowonly/metafile.yml
+++ b/configs/recognition/slowonly/metafile.yml
@@ -214,3 +214,26 @@ Models:
Top 5 Accuracy: 87.47
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20220901-4098e1eb.pth
+
+ - Name: slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb
+ Config: configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py
+ In Collection: SlowOnly
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 8
+ Epochs: 150
+ FLOPs: 54.75G
+ Parameters: 32.45M
+ Pretrained: ImageNet
+ Resolution: short-side 320
+ Training Data: Kinetics-710
+ Training Resources: 32 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: Kinetics-710
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 72.39
+ Top 5 Accuracy: 90.60
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb_20230612-12ce977c.pth
diff --git a/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py
new file mode 100644
index 0000000000..035ce29fc2
--- /dev/null
+++ b/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_32xb8-8x8x1-steplr-150e_kinetics710-rgb.py
@@ -0,0 +1,138 @@
+_base_ = [('slowonly_imagenet-pretrained-r50_16xb16-'
+ '4x16x1-steplr-150e_kinetics700-rgb.py')]
+
+model = dict(cls_head=dict(num_classes=710))
+
+file_client_args = dict(io_backend='disk')
+
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=8,
+ frame_interval=8,
+ num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=8,
+ frame_interval=8,
+ num_clips=10,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='ThreeCrop', crop_size=256),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+k400_data_root = 'data/kinetics400/videos_train'
+k600_data_root = 'data/kinetics600/videos'
+k700_data_root = 'data/kinetics700/videos'
+k400_data_root_val = 'data/kinetics400/videos_val'
+k600_data_root_val = k600_data_root
+k700_data_root_val = k700_data_root
+
+k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt'
+k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt'
+k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt'
+
+k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt'
+k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt'
+k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt'
+
+k400_trainset = dict(
+ type='VideoDataset',
+ ann_file=k400_ann_file_train,
+ data_prefix=dict(video=k400_data_root),
+ pipeline=train_pipeline)
+k600_trainset = dict(
+ type='VideoDataset',
+ ann_file=k600_ann_file_train,
+ data_prefix=dict(video=k600_data_root),
+ pipeline=train_pipeline)
+k700_trainset = dict(
+ type='VideoDataset',
+ ann_file=k700_ann_file_train,
+ data_prefix=dict(video=k700_data_root),
+ pipeline=train_pipeline)
+
+k400_valset = dict(
+ type='VideoDataset',
+ ann_file=k400_ann_file_val,
+ data_prefix=dict(video=k400_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+k600_valset = dict(
+ type='VideoDataset',
+ ann_file=k600_ann_file_val,
+ data_prefix=dict(video=k600_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+k700_valset = dict(
+ type='VideoDataset',
+ ann_file=k700_ann_file_val,
+ data_prefix=dict(video=k700_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+
+k400_testset = k400_valset.copy()
+k600_testset = k600_valset.copy()
+k700_testset = k700_valset.copy()
+k400_testset['pipeline'] = test_pipeline
+k600_testset['pipeline'] = test_pipeline
+k700_testset['pipeline'] = test_pipeline
+
+k710_trainset = dict(
+ type='ConcatDataset',
+ datasets=[k400_trainset, k600_trainset, k700_trainset],
+ _delete_=True)
+k710_valset = dict(
+ type='ConcatDataset',
+ datasets=[k400_valset, k600_valset, k700_valset],
+ _delete_=True)
+k710_testset = dict(
+ type='ConcatDataset',
+ datasets=[k400_testset, k600_testset, k700_testset],
+ _delete_=True,
+)
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=k710_trainset)
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=k710_valset)
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=k710_testset)
diff --git a/configs/recognition/swin/README.md b/configs/recognition/swin/README.md
index 1156c4a679..1935dbaa67 100644
--- a/configs/recognition/swin/README.md
+++ b/configs/recognition/swin/README.md
@@ -31,7 +31,13 @@ The vision community is witnessing a modeling shift from CNNs to Transformers, w
| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
| :---------------------: | :--------: | :--: | :------: | :----------: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: |
-| 32x2x1 | 224x224 | 16 | Swin-L | ImageNet-22k | 75.92 | 92.72 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py.log) |
+| 32x2x1 | 224x224 | 16 | Swin-L | ImageNet-22k | 75.92 | 92.72 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log) |
+
+### Kinetics-710
+
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :------: | :---------: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: |
+| 32x2x1 | 224x224 | 32 | Swin-S | ImageNet-1k | 76.90 | 92.96 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb_20230612-8e082ff1.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.log) |
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size.
2. The values in columns named after "reference" are the results got by testing on our dataset, using the checkpoints provided by the author with same model settings. `*` means that the numbers are copied from the paper.
diff --git a/configs/recognition/swin/metafile.yml b/configs/recognition/swin/metafile.yml
index 0a4cc41cb7..db40a707f3 100644
--- a/configs/recognition/swin/metafile.yml
+++ b/configs/recognition/swin/metafile.yml
@@ -120,3 +120,26 @@ Models:
Top 5 Accuracy: 92.72
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth
+
+ - Name: swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb
+ Config: configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py
+ In Collection: Swin
+ Metadata:
+ Architecture: Swin-S
+ Batch Size: 4
+ Epochs: 30
+ FLOPs: 604G
+ Parameters: 197M
+ Pretrained: ImageNet-1K
+ Resolution: 224x224
+ Training Data: Kinetics-710
+ Training Resources: 32 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: Kinetics-710
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 76.90
+ Top 5 Accuracy: 92.96
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb_20230612-8e082ff1.pth
diff --git a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py
new file mode 100644
index 0000000000..a3706ddcfc
--- /dev/null
+++ b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_32xb4-amp-32x2x1-30e_kinetics710-rgb.py
@@ -0,0 +1,144 @@
+_base_ = [
+ 'swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py'
+]
+
+model = dict(cls_head=dict(num_classes=710))
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=32,
+ frame_interval=2,
+ num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=32,
+ frame_interval=2,
+ num_clips=4,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='ThreeCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+k400_data_root = 'data/kinetics400/videos_train'
+k600_data_root = 'data/kinetics600/videos'
+k700_data_root = 'data/kinetics700/videos'
+k400_data_root_val = 'data/kinetics400/videos_val'
+k600_data_root_val = k600_data_root
+k700_data_root_val = k700_data_root
+
+k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt'
+k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt'
+k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt'
+
+k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt'
+k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt'
+k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt'
+
+k400_trainset = dict(
+ type='VideoDataset',
+ ann_file=k400_ann_file_train,
+ data_prefix=dict(video=k400_data_root),
+ pipeline=train_pipeline)
+k600_trainset = dict(
+ type='VideoDataset',
+ ann_file=k600_ann_file_train,
+ data_prefix=dict(video=k600_data_root),
+ pipeline=train_pipeline)
+k700_trainset = dict(
+ type='VideoDataset',
+ ann_file=k700_ann_file_train,
+ data_prefix=dict(video=k700_data_root),
+ pipeline=train_pipeline)
+
+k400_valset = dict(
+ type='VideoDataset',
+ ann_file=k400_ann_file_val,
+ data_prefix=dict(video=k400_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+k600_valset = dict(
+ type='VideoDataset',
+ ann_file=k600_ann_file_val,
+ data_prefix=dict(video=k600_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+k700_valset = dict(
+ type='VideoDataset',
+ ann_file=k700_ann_file_val,
+ data_prefix=dict(video=k700_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+
+k400_testset = k400_valset.copy()
+k600_testset = k600_valset.copy()
+k700_testset = k700_valset.copy()
+k400_testset['pipeline'] = test_pipeline
+k600_testset['pipeline'] = test_pipeline
+k700_testset['pipeline'] = test_pipeline
+
+k710_trainset = dict(
+ type='ConcatDataset',
+ datasets=[k400_trainset, k600_trainset, k700_trainset],
+ _delete_=True)
+k710_valset = dict(
+ type='ConcatDataset',
+ datasets=[k400_valset, k600_valset, k700_valset],
+ _delete_=True)
+k710_testset = dict(
+ type='ConcatDataset',
+ datasets=[k400_testset, k600_testset, k700_testset],
+ _delete_=True,
+)
+
+train_dataloader = dict(
+ batch_size=4,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=k710_trainset)
+val_dataloader = dict(
+ batch_size=4,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=k710_valset)
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=k710_testset)
+
+optim_wrapper = dict(optimizer=dict(lr=2e-3))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (16 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md
index abadd02f4f..1e3db9145b 100644
--- a/configs/recognition/tin/README.md
+++ b/configs/recognition/tin/README.md
@@ -50,7 +50,11 @@ Here, we use `finetune` to indicate that we use [TSM model](https://download.ope
:::
-For more details on data preparation, you can refer to Kinetics400, Something-Something V1 and Something-Something V2 in [Prepare Datasets](/docs/en/user_guides/2_data_prepare.md).
+For more details on data preparation, you can refer to
+
+- [Kinetics](/tools/data/kinetics/README.md)
+- [Something-something V1](/tools/data/sthv1/README.md)
+- [Something-something V2](/tools/data/sthv2/README.md)
## Train
diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md
index cb1af4b6b2..6b5e5ef5cf 100644
--- a/configs/recognition/tpn/README.md
+++ b/configs/recognition/tpn/README.md
@@ -41,7 +41,11 @@ Visual tempo characterizes the dynamics and the temporal scale of an action. Mod
:::
-For more details on data preparation, you can refer to Kinetics400, Something-Something V1 and Something-Something V2 in [Data Preparation](/docs/data_preparation.md).
+For more details on data preparation, you can refer to
+
+- [Kinetics](/tools/data/kinetics/README.md)
+- [Something-something V1](/tools/data/sthv1/README.md)
+- [Something-something V2](/tools/data/sthv2/README.md)
## Train
diff --git a/configs/recognition/tpn/metafile.yml b/configs/recognition/tpn/metafile.yml
index ce953f2e89..ba5f64f806 100644
--- a/configs/recognition/tpn/metafile.yml
+++ b/configs/recognition/tpn/metafile.yml
@@ -23,7 +23,7 @@ Models:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 74.20
- top5 accuracy: 91.48
+ Top 5 Accuracy: 91.48
Task: Action Recognition
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-97d0835d.pth
@@ -45,7 +45,7 @@ Models:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.74
- top5 accuracy: 92.57
+ Top 5 Accuracy: 92.57
Task: Action Recognition
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_imagenet-pretrained-r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-fed3f4c1.pth
diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md
index 97c1b33e34..3014d0e26b 100644
--- a/configs/recognition/tsm/README.md
+++ b/configs/recognition/tsm/README.md
@@ -29,6 +29,7 @@ The explosive growth in video streaming gives rise to challenges on performing v
| 1x1x8 | 224x224 | 8 | ResNet50 (NonLocalDotProduct) | ImageNet | 74.49 | 91.15 | 8 clips x 10 crop | 61.30G | 31.68M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb_20220831-108bfde5.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.log) |
| 1x1x8 | 224x224 | 8 | ResNet50 (NonLocalGauss) | ImageNet | 73.66 | 90.99 | 8 clips x 10 crop | 59.06G | 28.00M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb_20220831-7e54dacf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.log) |
| 1x1x8 | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 74.34 | 91.23 | 8 clips x 10 crop | 61.30G | 31.68M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb_20220831-35eddb57.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-embedded-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.log) |
+| 1x1x8 | 224x224 | 8 | MobileNetV2 | ImageNet | 68.71 | 88.32 | 8 clips x 3 crop | 3.269G | 2.736M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb_20230414-401127fd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.log) |
### Something-something V2
diff --git a/configs/recognition/tsm/metafile.yml b/configs/recognition/tsm/metafile.yml
index 64d37461d4..409f5a95df 100644
--- a/configs/recognition/tsm/metafile.yml
+++ b/configs/recognition/tsm/metafile.yml
@@ -167,6 +167,29 @@ Models:
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-gaussian_8xb16-1x1x8-50e_kinetics400-rgb_20220831-7e54dacf.pth
+ - Name: tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb
+ Config: configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py
+ In Collection: TSM
+ Metadata:
+ Architecture: MobileNetV2
+ Batch Size: 16
+ Epochs: 100
+ FLOPs: 3.269G
+ Parameters: 2.736M
+ Pretrained: ImageNet
+ Resolution: 224x224
+ Training Data: Kinetics-400
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: Kinetics-400
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 68.71
+ Top 5 Accuracy: 88.32
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb_20230414-401127fd.pth
+
- Name: tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb
Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py
In Collection: TSM
diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py
similarity index 91%
rename from configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py
rename to configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py
index 32c276647f..4fe084b401 100644
--- a/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py
+++ b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py
@@ -53,7 +53,7 @@
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
- dict(type='TenCrop', crop_size=224),
+ dict(type='ThreeCrop', crop_size=256),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
]
@@ -97,29 +97,28 @@
default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3))
train_cfg = dict(
- type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1)
+ type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
param_scheduler = [
- dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
dict(
type='MultiStepLR',
begin=0,
- end=50,
+ end=100,
by_epoch=True,
- milestones=[25, 45],
+ milestones=[40, 80],
gamma=0.1)
]
optim_wrapper = dict(
constructor='TSMOptimWrapperConstructor',
paramwise_cfg=dict(fc_lr5=True),
- optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001),
+ optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00002),
clip_grad=dict(max_norm=20, norm_type=2))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
-auto_scale_lr = dict(enable=False, base_batch_size=128)
+auto_scale_lr = dict(enable=True, base_batch_size=128)
diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md
index 61a65ace30..8ff8222649 100644
--- a/configs/recognition/tsn/README.md
+++ b/configs/recognition/tsn/README.md
@@ -48,6 +48,7 @@ It's possible and convenient to use a 3rd-party backbone for TSN under the frame
| 1x1x3 | MultiStep | 224x224 | 8 | ResNext101 | ImageNet | 72.95 | 90.36 | 25 clips x 10 crop | 200.3G | 42.95M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb_20221209-de2d5615.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-rn101-32x4d_8xb32-1x1x3-100e_kinetics400-rgb.log) |
| 1x1x3 | MultiStep | 224x224 | 8 | DenseNet161 | ImageNet | 72.07 | 90.15 | 25 clips x 10 crop | 194.6G | 27.36M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb_20220906-5f4c0daf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.log) |
| 1x1x3 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 77.03 | 92.61 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log) |
+| 1x1x8 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 79.22 | 94.20 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.log) |
1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details.
2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter auto-scales the learning rate according to the actual batch size and the original batch size, as sketched below.
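A minimal sketch of the linear scaling rule behind `--auto-scale-lr`, assuming the `base_batch_size=128` (8 GPUs x 16 samples per GPU) from the config above. The helper name `auto_scale_lr` and the example numbers are illustrative; the actual scaling is performed inside MMEngine.

```python
# Linear LR scaling sketch: scaled_lr = base_lr * actual_batch_size / base_batch_size.
def auto_scale_lr(base_lr: float, num_gpus: int, samples_per_gpu: int,
                  base_batch_size: int = 128) -> float:
    actual_batch_size = num_gpus * samples_per_gpu
    return base_lr * actual_batch_size / base_batch_size

# Training on 4 GPUs x 16 samples per GPU instead of 8 x 16 halves the LR.
print(auto_scale_lr(0.02, num_gpus=4, samples_per_gpu=16))  # 0.01
print(auto_scale_lr(0.02, num_gpus=8, samples_per_gpu=16))  # 0.02
```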
diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py
new file mode 100644
index 0000000000..9999f2b3b0
--- /dev/null
+++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py
@@ -0,0 +1,21 @@
+_base_ = ['../tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py']
+
+model = dict(
+ backbone=dict(
+ type='timm.swin_base_patch4_window7_224',
+ pretrained=True,
+ _delete_=True),
+ cls_head=dict(in_channels=1024))
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1)
+
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=50,
+ by_epoch=True,
+ milestones=[20, 40],
+ gamma=0.1)
+]
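To see what the `_base_` inheritance and `_delete_=True` in the new config resolve to, the merged config can be loaded with `mmengine.Config`. This assumes the snippet is run from an MMAction2 checkout so the base TSN config can be resolved.

```python
# Inspect the merged config produced by `_base_` inheritance; `_delete_=True`
# discards the inherited ResNet backbone keys before the timm Swin settings
# are applied, while the rest of the TSN recipe is kept from the base file.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/recognition/tsn/custom_backbones/'
    'tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py')

print(cfg.model.backbone)              # {'type': 'timm.swin_base_patch4_window7_224', ...}
print(cfg.model.cls_head.in_channels)  # 1024
print(cfg.train_cfg.max_epochs)        # 50
```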
diff --git a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py
index b92e738e20..a549c0d924 100644
--- a/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py
+++ b/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py
@@ -4,5 +4,6 @@
backbone=dict(
type='timm.swin_base_patch4_window7_224',
pretrained=True,
+ feature_shape='NHWC',
_delete_=True),
cls_head=dict(in_channels=1024))
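The new `feature_shape='NHWC'` flag tells the timm backbone wrapper that Swin emits channel-last feature maps. As a layout illustration only (the real conversion happens inside MMAction2's timm backbone wrapper), the permutation to the channel-first tensors the TSN head consumes looks like this:

```python
import torch

# Layout illustration: a channel-last (NHWC) Swin feature map permuted to the
# channel-first (NCHW) layout expected by the TSN head. Not the wrapper's code.
def nhwc_to_nchw(feat: torch.Tensor) -> torch.Tensor:
    # (N, H, W, C) -> (N, C, H, W)
    return feat.permute(0, 3, 1, 2).contiguous()

feat = torch.randn(8, 7, 7, 1024)   # e.g. Swin-base feature map for 224x224 input
print(nhwc_to_nchw(feat).shape)     # torch.Size([8, 1024, 7, 7])
```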
diff --git a/configs/recognition/tsn/metafile.yml b/configs/recognition/tsn/metafile.yml
index 37943e673b..378040098c 100644
--- a/configs/recognition/tsn/metafile.yml
+++ b/configs/recognition/tsn/metafile.yml
@@ -173,7 +173,7 @@ Models:
Config: configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py
In Collection: TSN
Metadata:
- Architecture: ResNet50
+ Architecture: Swin-base
Batch Size: 32
Epochs: 100
FLOPs: 386.7G
@@ -192,6 +192,29 @@ Models:
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth
+ - Name: tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb
+ Config: configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py
+ In Collection: TSN
+ Metadata:
+ Architecture: Swin-base
+ Batch Size: 8
+ Epochs: 50
+ FLOPs: 386.7G
+ Parameters: 87.15M
+ Pretrained: ImageNet
+ Resolution: 224x224
+ Training Data: Kinetics-400
+ Training Resources: 32 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: Kinetics-400
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 79.22
+ Top 5 Accuracy: 94.20
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth
+
- Name: tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb
Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py
In Collection: TSN
diff --git a/configs/recognition/uniformer/README.md b/configs/recognition/uniformer/README.md
index bf19cb31ca..6d04b7920e 100644
--- a/configs/recognition/uniformer/README.md
+++ b/configs/recognition/uniformer/README.md
@@ -20,11 +20,11 @@ It is a challenging task to learn rich and multi-scale spatiotemporal semantics
### Kinetics-400
-| frame sampling strategy | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md)>) top1 acc | [reference](<(https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt |
-| :---------------------: | :------------: | :---------: | :------: | :------: | :-----------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 16x4x1 | short-side 320 | UniFormer-S | 80.9 | 94.6 | 80.8 | 94.7 | 80.9 | 94.6 | 4 clips x 1 crop | 41.8G | 21.4M | [config](/configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-c630a037.pth) |
-| 16x4x1 | short-side 320 | UniFormer-B | 82.0 | 95.0 | 82.0 | 95.1 | 82.0 | 95.0 | 4 clips x 1 crop | 96.7G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-157c2e66.pth) |
-| 32x4x1 | short-side 320 | UniFormer-B | 83.1 | 95.3 | 82.9 | 95.4 | 83.0 | 95.3 | 4 clips x 1 crop | 59G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb_20221219-b776322c.pth) |
+| frame sampling strategy | resolution | backbone | top1 acc | top5 acc | [reference](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) top1 acc | [reference](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt |
+| :---------------------: | :------------: | :---------: | :------: | :------: | :-------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 16x4x1 | short-side 320 | UniFormer-S | 80.9 | 94.6 | 80.8 | 94.7 | 80.9 | 94.6 | 4 clips x 1 crop | 41.8G | 21.4M | [config](/configs/recognition/uniformer/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-small_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-c630a037.pth) |
+| 16x4x1 | short-side 320 | UniFormer-B | 82.0 | 95.0 | 82.0 | 95.1 | 82.0 | 95.0 | 4 clips x 1 crop | 96.7G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_16x4x1_kinetics400-rgb_20221219-157c2e66.pth) |
+| 32x4x1 | short-side 320 | UniFormer-B | 83.1 | 95.3 | 82.9 | 95.4 | 83.0 | 95.3 | 4 clips x 1 crop | 59G | 49.8M | [config](/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb_20221219-b776322c.pth) |
The models are ported from the repo [UniFormer](https://github.com/Sense-X/UniFormer/blob/main/video_classification/README.md) and tested on our data. Currently, we only support the testing of UniFormer models, training will be available soon.
diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md
index e090751b33..35fcb6c5fe 100644
--- a/configs/recognition/uniformerv2/README.md
+++ b/configs/recognition/uniformerv2/README.md
@@ -20,60 +20,60 @@ Learning discriminative spatiotemporal representation is the key problem of vide
### Kinetics-400
-| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
-| :--------------: | :------------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | short-side 320 | UniFormerV2-B/16 | clip | - | - | 84.3 | 96.4 | 84.4 | 96.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) |
-| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | - | - | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) |
-| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - |
-| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - |
-| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - |
-| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :------------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | short-side 320 | UniFormerV2-B/16 | clip | - | - | 84.3 | 96.4 | 84.4 | 96.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) |
+| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | - | - | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) |
+| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - |
+| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - |
+| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - |
+| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - |
### Kinetics-600
-| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
-| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 86.1 | 97.2 | 86.4 | 97.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) |
-| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - |
-| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - |
-| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - |
-| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 86.1 | 97.2 | 86.4 | 97.3 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) |
+| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - |
+| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - |
+| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - |
+| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - |
### Kinetics-700
-| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
-| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16 | clip | - | - | 75.8 | 92.8 | 75.9 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) |
-| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 76.3 | 92.7 | 76.3 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) |
-| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - |
-| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - |
-| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - |
-| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16 | clip | - | - | 75.8 | 92.8 | 75.9 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) |
+| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | - | - | 76.3 | 92.7 | 76.3 | 92.9 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) |
+| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - |
+| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - |
+| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - |
+| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - |
### MiTv1
-| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
-| :--------------: | :--------: | :--------------------: | :--------------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) |
-| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - |
-| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top1 acc | [reference](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :--------: | :--------------------: | :--------------------------: | :------: | :------: | :-----------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) |
+| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - |
+| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - |
### Kinetics-710
-| uniform sampling | resolution | backbone | pretrain | config | ckpt |
-| :--------------: | :--------: | :--------------------: | :------: | :-----------------------------------------------------------------------: | :---------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth) |
-| 8 | Raw | UniFormerV2-L/14\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth) |
-| 8 | Raw | UniFormerV2-L/14@336\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth) |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | config | ckpt | log |
+| :--------------: | :--------: | :--------------------: | :------: | :------: | :------: | :------------------------------------------: | :----------------------------------------: | :----------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16\* | clip | 78.9 | 94.2 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log) |
+| 8 | Raw | UniFormerV2-L/14\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth) | - |
+| 8 | Raw | UniFormerV2-L/14@336\* | clip | - | - | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth) | - |
The models with * are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Due to computational limitations, we only support reliable training config for base model (i.e. UniFormerV2-B/16).
1. The values in columns named after "reference" are the results of the original repo.
2. The values in `top1/5 acc` are tested on the same data list as the original repo, and the label map is provided by [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL).
3. The values in columns named after "mm-Kinetics" are the testing results on the Kinetics dataset held by MMAction2, which is also used by other models in MMAction2. Due to the differences between various versions of the Kinetics dataset, there is a small gap between `top1/5 acc` and `mm-Kinetics top1/5 acc`. For a fair comparison with other models, we report both results here. Note that we only report inference results; since the training set differs between UniFormer and other models, the results are lower than those tested on the author's version.
-4. Since the original models for Kinetics-400/600/700 adopt different [label file](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL), we simply map the weight according to the label name. New label map for Kinetics-400/600/700 can be found [here](https://github.com/open-mmlab/mmaction2/tree/main/tools/data/kinetics).
+4. Since the original models for Kinetics-400/600/700 adopt different [label file](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL), we simply map the weight according to the label name. New label map for Kinetics-400/600/700 can be found [here](/tools/data/kinetics).
5. Due to some differences between [SlowFast](https://github.com/facebookresearch/SlowFast) and MMAction2, there are some gaps between their performances.
-6. Kinetics-710 is used for pretraining, which helps improve the performance on other datasets efficiently. You can find more details in the [paper](https://arxiv.org/abs/2211.09552).
+6. Kinetics-710 is used for pretraining, which helps improve the performance on other datasets efficiently. You can find more details in the [paper](https://arxiv.org/abs/2211.09552). We also map the weights for the Kinetics-710 checkpoints; the label map is available [here](/tools/data/kinetics710/label_map_k710.txt).
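A hedged sketch of the "map the weight according to the label name" step mentioned in notes 4 and 6: classifier rows are reordered by matching class names between two label maps. The `<class name> <index>` line format follows the label map files referenced above; the checkpoint key and file paths in the usage comment are illustrative.

```python
import torch

def load_label_map(path):
    """Parse lines of the form '<class name> <index>' into {name: index}."""
    mapping = {}
    with open(path) as f:
        for line in f:
            name, idx = line.rstrip('\n').rsplit(' ', 1)
            mapping[name] = int(idx)
    return mapping

def remap_fc_weight(weight: torch.Tensor, src_map: dict, dst_map: dict) -> torch.Tensor:
    """Reorder classifier rows so that row i corresponds to the i-th class of dst_map."""
    dst_names = sorted(dst_map, key=dst_map.get)
    order = [src_map[name] for name in dst_names]
    return weight[order]

# Usage idea (key name and paths are illustrative, adjust to the actual checkpoint):
# state = torch.load('k710_checkpoint.pth', map_location='cpu')['state_dict']
# state['cls_head.fc_cls.weight'] = remap_fc_weight(
#     state['cls_head.fc_cls.weight'],
#     load_label_map('tools/data/kinetics710/label_map_k710.txt'),
#     load_label_map('tools/data/kinetics/label_map_k400.txt'))
```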
For more details on data preparation, you can refer to
diff --git a/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt b/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt
deleted file mode 100644
index 150f3447b4..0000000000
--- a/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt
+++ /dev/null
@@ -1,710 +0,0 @@
-riding a bike 0
-marching 1
-dodgeball 2
-playing cymbals 3
-checking tires 4
-roller skating 5
-tasting beer 6
-clapping 7
-drawing 8
-juggling fire 9
-bobsledding 10
-petting animal (not cat) 11
-spray painting 12
-training dog 13
-eating watermelon 14
-building cabinet 15
-applauding 16
-playing harp 17
-inflating balloons 18
-sled dog racing 19
-wrestling 20
-pole vault 21
-hurling (sport) 22
-riding scooter 23
-shearing sheep 24
-sweeping floor 25
-eating carrots 26
-skateboarding 27
-dunking basketball 28
-disc golfing 29
-eating spaghetti 30
-playing flute 31
-riding mechanical bull 32
-making sushi 33
-trapezing 34
-picking apples 35
-stretching leg 36
-playing ukulele 37
-tying necktie 38
-skydiving 39
-playing cello 40
-jumping into pool 41
-shooting goal (soccer) 42
-trimming trees 43
-bookbinding 44
-ski jumping 45
-walking the dog 46
-riding unicycle 47
-shaving head 48
-hopscotch 49
-playing piano 50
-parasailing 51
-bartending 52
-kicking field goal 53
-finger snapping 54
-dining 55
-yawning 56
-peeling potatoes 57
-canoeing or kayaking 58
-front raises 59
-laughing 60
-dancing macarena 61
-digging 62
-reading newspaper 63
-hitting baseball 64
-clay pottery making 65
-exercising with an exercise ball 66
-playing saxophone 67
-shooting basketball 68
-washing hair 69
-lunge 70
-brushing hair 71
-curling hair 72
-kitesurfing 73
-tapping guitar 74
-bending back 75
-skipping rope 76
-situp 77
-folding paper 78
-cracking neck 79
-assembling computer 80
-cleaning gutters 81
-blowing out candles 82
-shaking hands 83
-dancing gangnam style 84
-windsurfing 85
-tap dancing 86
-skiing mono 87
-bandaging 88
-push up 89
-doing nails 90
-punching person (boxing) 91
-bouncing on trampoline 92
-scrambling eggs 93
-singing 94
-cleaning floor 95
-krumping 96
-drumming fingers 97
-snowmobiling 98
-gymnastics tumbling 99
-headbanging 100
-catching or throwing frisbee 101
-riding elephant 102
-bee keeping 103
-feeding birds 104
-snatch weight lifting 105
-mowing lawn 106
-fixing hair 107
-playing trumpet 108
-flying kite 109
-crossing river 110
-swinging legs 111
-sanding floor 112
-belly dancing 113
-sneezing 114
-clean and jerk 115
-side kick 116
-filling eyebrows 117
-shuffling cards 118
-recording music 119
-cartwheeling 120
-feeding fish 121
-folding clothes 122
-water skiing 123
-tobogganing 124
-blowing leaves 125
-smoking 126
-unboxing 127
-tai chi 128
-waxing legs 129
-riding camel 130
-slapping 131
-tossing salad 132
-capoeira 133
-playing cards 134
-playing organ 135
-playing violin 136
-playing drums 137
-tapping pen 138
-vault 139
-shoveling snow 140
-playing tennis 141
-getting a tattoo 142
-making a sandwich 143
-making tea 144
-grinding meat 145
-squat 146
-eating doughnuts 147
-ice fishing 148
-snowkiting 149
-kicking soccer ball 150
-playing controller 151
-giving or receiving award 152
-welding 153
-throwing discus 154
-throwing axe 155
-ripping paper 156
-swimming butterfly stroke 157
-air drumming 158
-blowing nose 159
-hockey stop 160
-taking a shower 161
-bench pressing 162
-planting trees 163
-pumping fist 164
-climbing tree 165
-tickling 166
-high kick 167
-waiting in line 168
-slacklining 169
-tango dancing 170
-hurdling 171
-carrying baby 172
-celebrating 173
-sharpening knives 174
-passing American football (in game) 175
-headbutting 176
-playing recorder 177
-brush painting 178
-person collecting garbage 179
-robot dancing 180
-shredding paper 181
-pumping gas 182
-rock climbing 183
-hula hooping 184
-braiding hair 185
-opening present 186
-texting 187
-decorating the christmas tree 188
-answering questions 189
-playing keyboard 190
-writing 191
-bungee jumping 192
-smelling feet 193
-eating burger 194
-playing accordion 195
-making pizza 196
-playing volleyball 197
-tasting food 198
-pushing cart 199
-spinning poi 200
-cleaning windows 201
-arm wrestling 202
-changing oil 203
-swimming breast stroke 204
-tossing coin 205
-deadlifting 206
-hoverboarding 207
-cutting watermelon 208
-cheerleading 209
-snorkeling 210
-washing hands 211
-eating cake 212
-pull ups 213
-surfing water 214
-eating hotdog 215
-holding snake 216
-playing harmonica 217
-ironing 218
-cutting nails 219
-golf chipping 220
-shot put 221
-hugging (not baby) 222
-playing clarinet 223
-faceplanting 224
-trimming or shaving beard 225
-drinking shots 226
-riding mountain bike 227
-tying bow tie 228
-swinging on something 229
-skiing crosscountry 230
-unloading truck 231
-cleaning pool 232
-jogging 233
-ice climbing 234
-mopping floor 235
-making the bed 236
-diving cliff 237
-washing dishes 238
-grooming dog 239
-weaving basket 240
-frying vegetables 241
-stomping grapes 242
-moving furniture 243
-cooking sausages (not on barbeque) 244
-doing laundry 245
-dyeing hair 246
-knitting 247
-reading book 248
-baby waking up 249
-punching bag 250
-surfing crowd 251
-cooking chicken 252
-pushing car 253
-springboard diving 254
-swing dancing 255
-massaging legs 256
-beatboxing 257
-breading or breadcrumbing 258
-somersaulting 259
-brushing teeth 260
-stretching arm 261
-juggling balls 262
-massaging person's head 263
-eating ice cream 264
-extinguishing fire 265
-hammer throw 266
-whistling 267
-crawling baby 268
-using remote controller (not gaming) 269
-playing cricket 270
-opening bottle (not wine) 271
-playing xylophone 272
-motorcycling 273
-driving car 274
-exercising arm 275
-passing American football (not in game) 276
-playing kickball 277
-sticking tongue out 278
-flipping pancake 279
-catching fish 280
-eating chips 281
-shaking head 282
-sword fighting 283
-playing poker 284
-cooking on campfire 285
-doing aerobics 286
-paragliding 287
-using segway 288
-folding napkins 289
-playing bagpipes 290
-gargling 291
-skiing slalom 292
-strumming guitar 293
-javelin throw 294
-waxing back 295
-riding or walking with horse 296
-plastering 297
-long jump 298
-parkour 299
-wrapping present 300
-egg hunting 301
-archery 302
-cleaning toilet 303
-swimming backstroke 304
-snowboarding 305
-catching or throwing baseball 306
-massaging back 307
-blowing glass 308
-playing guitar 309
-playing chess 310
-golf driving 311
-presenting weather forecast 312
-rock scissors paper 313
-high jump 314
-baking cookies 315
-using computer 316
-washing feet 317
-arranging flowers 318
-playing bass guitar 319
-spraying 320
-cutting pineapple 321
-waxing chest 322
-auctioning 323
-jetskiing 324
-sipping cup 325
-busking 326
-playing monopoly 327
-salsa dancing 328
-waxing eyebrows 329
-watering plants 330
-zumba 331
-chopping wood 332
-pushing wheelchair 333
-carving pumpkin 334
-building shed 335
-making jewelry 336
-catching or throwing softball 337
-bending metal 338
-ice skating 339
-dancing charleston 340
-abseiling 341
-climbing a rope 342
-crying 343
-cleaning shoes 344
-dancing ballet 345
-driving tractor 346
-triple jump 347
-throwing ball 348
-getting a haircut 349
-running on treadmill 350
-climbing ladder 351
-blasting sand 352
-playing trombone 353
-drop kicking 354
-country line dancing 355
-changing wheel (not on bike) 356
-feeding goats 357
-tying knot (not on a tie) 358
-setting table 359
-shaving legs 360
-kissing 361
-riding mule 362
-counting money 363
-laying bricks 364
-barbequing 365
-news anchoring 366
-smoking hookah 367
-cooking egg 368
-peeling apples 369
-yoga 370
-sharpening pencil 371
-dribbling basketball 372
-petting cat 373
-playing ice hockey 374
-milking cow 375
-shining shoes 376
-juggling soccer ball 377
-scuba diving 378
-playing squash or racquetball 379
-drinking beer 380
-sign language interpreting 381
-playing basketball 382
-breakdancing 383
-testifying 384
-making snowman 385
-golf putting 386
-playing didgeridoo 387
-biking through snow 388
-sailing 389
-jumpstyle dancing 390
-water sliding 391
-grooming horse 392
-massaging feet 393
-playing paintball 394
-making a cake 395
-bowling 396
-contact juggling 397
-applying cream 398
-playing badminton 399
-poaching eggs 400
-playing nose flute 401
-entering church 402
-closing door 403
-helmet diving 404
-doing sudoku 405
-coughing 406
-seasoning food 407
-peeling banana 408
-eating nachos 409
-waxing armpits 410
-shouting 411
-silent disco 412
-polishing furniture 413
-taking photo 414
-dealing cards 415
-putting wallpaper on wall 416
-uncorking champagne 417
-curling eyelashes 418
-brushing floor 419
-pulling espresso shot 420
-playing american football 421
-grooming cat 422
-playing checkers 423
-moving child 424
-stacking cups 425
-squeezing orange 426
-opening coconuts 427
-rolling eyes 428
-picking blueberries 429
-playing road hockey 430
-carving wood with a knife 431
-slicing onion 432
-saluting 433
-letting go of balloon 434
-breaking glass 435
-carrying weight 436
-mixing colours 437
-moving baby 438
-blending fruit 439
-pouring milk 440
-surveying 441
-making slime 442
-sieving 443
-walking with crutches 444
-flipping bottle 445
-playing billiards 446
-arresting 447
-listening with headphones 448
-spinning plates 449
-carving marble 450
-cutting cake 451
-shoot dance 452
-being excited 453
-petting horse 454
-splashing water 455
-filling cake 456
-stacking dice 457
-checking watch 458
-treating wood 459
-laying decking 460
-shooting off fireworks 461
-pouring wine 462
-pretending to be a statue 463
-steering car 464
-playing rounders 465
-looking in mirror 466
-jumping sofa 467
-lighting candle 468
-walking on stilts 469
-crocheting 470
-playing piccolo 471
-vacuuming car 472
-high fiving 473
-playing shuffleboard 474
-chasing 475
-pulling rope (game) 476
-being in zero gravity 477
-sanding wood 478
-decoupage 479
-using megaphone 480
-making latte art 481
-ski ballet 482
-playing oboe 483
-bouncing ball (not juggling) 484
-playing mahjong 485
-herding cattle 486
-swimming with sharks 487
-milking goat 488
-swimming with dolphins 489
-metal detecting 490
-playing slot machine 491
-polishing metal 492
-throwing tantrum 493
-lawn mower racing 494
-laying stone 495
-cutting orange 496
-skipping stone 497
-pouring beer 498
-making bubbles 499
-jaywalking 500
-leatherworking 501
-card stacking 502
-putting on eyeliner 503
-card throwing 504
-chewing gum 505
-falling off bike 506
-repairing puncture 507
-dumpster diving 508
-tiptoeing 509
-sleeping 510
-using circular saw 511
-cracking knuckles 512
-pinching 513
-chiseling wood 514
-playing rubiks cube 515
-weaving fabric 516
-fencing (sport) 517
-sword swallowing 518
-lighting fire 519
-vacuuming floor 520
-combing hair 521
-building lego 522
-playing pinball 523
-fly tying 524
-playing lute 525
-opening door 526
-waving hand 527
-rolling pastry 528
-chiseling stone 529
-threading needle 530
-playing dominoes 531
-opening wine bottle 532
-playing with trains 533
-steer roping 534
-playing field hockey 535
-separating eggs 536
-sewing 537
-talking on cell phone 538
-needle felting 539
-pushing wheelbarrow 540
-using a paint roller 541
-playing netball 542
-lifting hat 543
-massaging neck 544
-blowing bubble gum 545
-walking through snow 546
-docking boat 547
-clam digging 548
-marriage proposal 549
-packing 550
-sausage making 551
-licking 552
-scrapbooking 553
-flint knapping 554
-lock picking 555
-putting on lipstick 556
-sawing wood 557
-playing hand clapping games 558
-geocaching 559
-looking at phone 560
-making cheese 561
-poking bellybutton 562
-contorting 563
-fixing bicycle 564
-using a microscope 565
-using a wrench 566
-doing jigsaw puzzle 567
-making horseshoes 568
-cooking scallops 569
-square dancing 570
-getting a piercing 571
-playing ocarina 572
-making paper aeroplanes 573
-playing scrabble 574
-visiting the zoo 575
-crossing eyes 576
-jumping bicycle 577
-throwing water balloon 578
-bodysurfing 579
-pirouetting 580
-luge 581
-spelunking 582
-watching tv 583
-attending conference 584
-curling (sport) 585
-directing traffic 586
-swimming front crawl 587
-ice swimming 588
-battle rope training 589
-putting on mascara 590
-bouncing on bouncy castle 591
-smoking pipe 592
-pillow fight 593
-putting on sari 594
-calligraphy 595
-roasting pig 596
-cracking back 597
-shopping 598
-burping 599
-using bagging machine 600
-staring 601
-shucking oysters 602
-blowdrying hair 603
-smashing 604
-playing laser tag 605
-wading through mud 606
-rope pushdown 607
-preparing salad 608
-making balloon shapes 609
-tagging graffiti 610
-adjusting glasses 611
-using a power drill 612
-trimming shrubs 613
-popping balloons 614
-playing pan pipes 615
-using puppets 616
-arguing 617
-backflip (human) 618
-riding snow blower 619
-hand washing clothes 620
-calculating 621
-gospel singing in church 622
-standing on hands 623
-tasting wine 624
-shaping bread dough 625
-wading through water 626
-falling off chair 627
-throwing snowballs 628
-building sandcastle 629
-land sailing 630
-tying shoe laces 631
-jumping jacks 632
-wood burning (art) 633
-putting on foundation 634
-putting on shoes 635
-cumbia 636
-archaeological excavation 637
-mountain climber (exercise) 638
-assembling bicycle 639
-head stand 640
-cutting apple 641
-shuffling feet 642
-bottling 643
-breathing fire 644
-using inhaler 645
-historical reenactment 646
-hugging baby 647
-mushroom foraging 648
-delivering mail 649
-laying tiles 650
-using atm 651
-chopping meat 652
-tightrope walking 653
-mosh pit dancing 654
-photobombing 655
-coloring in 656
-huddling 657
-playing gong 658
-laying concrete 659
-breaking boards 660
-acting in play 661
-base jumping 662
-tie dying 663
-using a sledge hammer 664
-playing ping pong 665
-photocopying 666
-winking 667
-waking up 668
-swinging baseball bat 669
-twiddling fingers 670
-playing polo 671
-longboarding 672
-ironing hair 673
-bathing dog 674
-moon walking 675
-playing marbles 676
-embroidering 677
-playing beer pong 678
-home roasting coffee 679
-gold panning 680
-karaoke 681
-changing gear in car 682
-raising eyebrows 683
-yarn spinning 684
-scrubbing face 685
-fidgeting 686
-planing wood 687
-cosplaying 688
-capsizing 689
-tackling 690
-shining flashlight 691
-dyeing eyebrows 692
-drooling 693
-alligator wrestling 694
-playing blackjack 695
-carving ice 696
-playing maracas 697
-opening refrigerator 698
-throwing knife 699
-putting in contact lenses 700
-passing soccer ball 701
-casting fishing line 702
-sucking lolly 703
-installing carpet 704
-bulldozing 705
-roasting marshmallows 706
-playing darts 707
-chopping vegetables 708
-bull fighting 709
diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k400.json b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json
index f97fa4d49f..5d1ee4cbba 100644
--- a/configs/recognition/uniformerv2/k710_channel_map/map_k400.json
+++ b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json
@@ -1 +1 @@
-[341, 158, 189, 16, 398, 302, 202, 318, 80, 323, 249, 315, 18, 88, 365, 52, 257, 103, 113, 162, 75, 338, 388, 352, 308, 125, 159, 82, 10, 44, 92, 396, 185, 258, 383, 178, 71, 260, 15, 335, 192, 326, 58, 133, 172, 120, 334, 280, 306, 101, 337, 173, 203, 356, 4, 209, 332, 7, 65, 115, 95, 81, 232, 344, 303, 201, 342, 351, 165, 397, 252, 368, 285, 244, 363, 355, 79, 268, 110, 343, 72, 219, 321, 208, 345, 340, 84, 61, 206, 188, 62, 55, 29, 237, 2, 286, 245, 90, 8, 372, 325, 380, 226, 274, 346, 354, 97, 28, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 275, 66, 265, 224, 104, 121, 357, 117, 54, 107, 279, 109, 122, 289, 78, 59, 241, 179, 291, 349, 142, 152, 220, 311, 386, 145, 239, 392, 99, 266, 100, 176, 314, 167, 64, 160, 216, 49, 207, 222, 184, 171, 22, 234, 148, 339, 218, 294, 324, 233, 262, 9, 377, 41, 390, 53, 150, 361, 73, 247, 96, 60, 364, 298, 70, 395, 143, 236, 336, 196, 385, 33, 144, 1, 307, 393, 256, 263, 375, 235, 273, 243, 106, 366, 271, 186, 287, 51, 299, 175, 276, 369, 57, 11, 373, 35, 163, 297, 195, 399, 290, 382, 319, 134, 40, 310, 223, 151, 270, 3, 387, 137, 31, 309, 217, 17, 374, 190, 277, 327, 135, 394, 50, 284, 177, 67, 379, 141, 353, 108, 37, 136, 197, 272, 21, 312, 213, 164, 182, 250, 91, 89, 253, 199, 333, 248, 63, 119, 0, 130, 102, 32, 227, 362, 296, 23, 47, 156, 180, 183, 313, 5, 350, 389, 328, 112, 93, 378, 359, 83, 282, 174, 371, 48, 360, 24, 376, 68, 42, 221, 140, 181, 118, 116, 381, 94, 77, 27, 45, 87, 230, 292, 76, 39, 169, 131, 19, 126, 367, 105, 114, 193, 210, 305, 149, 98, 259, 200, 12, 320, 254, 146, 278, 242, 261, 36, 293, 251, 214, 25, 304, 204, 157, 255, 111, 229, 283, 128, 161, 170, 86, 74, 138, 6, 198, 384, 187, 155, 348, 154, 166, 124, 205, 132, 13, 34, 225, 43, 347, 228, 358, 38, 127, 231, 316, 269, 288, 139, 168, 46, 238, 317, 69, 211, 123, 391, 330, 295, 322, 329, 129, 240, 153, 267, 85, 300, 20, 191, 56, 370, 331]
+[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399]
diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k600.json b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json
index f0d3b1b0e9..64bdcd0f55 100644
--- a/configs/recognition/uniformerv2/k710_channel_map/map_k600.json
+++ b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json
@@ -1 +1 @@
-[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 639, 80, 584, 323, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 113, 162, 75, 338, 388, 352, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 591, 92, 396, 185, 258, 383, 660, 644, 178, 71, 260, 15, 522, 629, 335, 709, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 120, 696, 334, 702, 280, 306, 101, 337, 173, 682, 203, 356, 4, 209, 505, 529, 514, 652, 708, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 656, 521, 397, 563, 368, 285, 244, 569, 688, 363, 355, 597, 512, 79, 268, 576, 110, 343, 636, 585, 72, 641, 219, 496, 321, 208, 345, 340, 84, 61, 206, 188, 649, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 677, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 54, 564, 107, 554, 279, 524, 109, 122, 289, 78, 59, 241, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 239, 392, 99, 266, 620, 640, 100, 176, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 495, 650, 501, 552, 543, 519, 555, 298, 672, 560, 581, 70, 395, 143, 609, 499, 561, 568, 336, 573, 196, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 375, 675, 235, 654, 273, 638, 243, 106, 648, 539, 366, 271, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 57, 179, 11, 373, 655, 666, 35, 593, 513, 580, 687, 163, 297, 195, 399, 290, 382, 319, 678, 695, 40, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 697, 676, 327, 542, 572, 135, 394, 615, 50, 523, 665, 284, 671, 177, 515, 67, 574, 379, 141, 353, 108, 37, 136, 197, 533, 272, 562, 21, 492, 614, 498, 608, 312, 213, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 528, 607, 350, 389, 328, 112, 551, 557, 93, 553, 685, 378, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 68, 42, 598, 221, 140, 602, 118, 642, 116, 381, 94, 325, 77, 27, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 200, 12, 254, 570, 146, 623, 601, 534, 278, 242, 261, 36, 703, 251, 214, 25, 304, 204, 157, 587, 255, 669, 229, 283, 518, 690, 610, 128, 538, 170, 86, 74, 138, 6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 13, 34, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 616, 269, 288, 520, 575, 606, 626, 168, 668, 46, 546, 238, 317, 69, 211, 583, 123, 391, 330, 527, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331]
+[0, 661, 611, 1, 694, 2, 3, 4, 637, 5, 617, 6, 7, 639, 8, 584, 9, 618, 11, 13, 14, 15, 662, 674, 589, 16, 17, 18, 19, 20, 21, 22, 23, 603, 545, 24, 25, 26, 27, 28, 579, 29, 643, 591, 30, 31, 32, 33, 34, 660, 644, 35, 36, 37, 38, 522, 629, 39, 709, 705, 40, 599, 41, 621, 595, 42, 43, 689, 502, 504, 44, 45, 696, 46, 702, 47, 48, 49, 50, 51, 682, 52, 53, 54, 55, 505, 529, 514, 652, 708, 56, 548, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 656, 521, 69, 563, 71, 72, 73, 569, 688, 74, 75, 597, 512, 76, 77, 576, 78, 79, 636, 585, 80, 641, 81, 496, 82, 83, 84, 85, 86, 87, 88, 89, 649, 91, 586, 92, 93, 547, 94, 95, 567, 96, 97, 98, 99, 102, 103, 104, 693, 105, 106, 508, 107, 692, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 677, 120, 121, 122, 506, 627, 123, 124, 125, 517, 686, 127, 564, 128, 554, 129, 524, 130, 131, 132, 133, 134, 135, 559, 138, 571, 139, 140, 680, 141, 142, 143, 622, 144, 145, 146, 147, 148, 620, 640, 149, 150, 151, 152, 646, 153, 154, 155, 679, 156, 157, 657, 158, 647, 159, 160, 161, 162, 163, 164, 588, 12, 704, 165, 673, 166, 500, 167, 168, 169, 170, 171, 577, 172, 632, 173, 681, 174, 175, 176, 177, 178, 179, 630, 180, 494, 181, 659, 495, 650, 501, 552, 543, 519, 555, 182, 672, 560, 581, 183, 184, 185, 609, 499, 561, 568, 187, 573, 188, 189, 190, 191, 186, 192, 549, 193, 194, 195, 544, 196, 197, 675, 198, 654, 199, 638, 200, 201, 648, 539, 202, 203, 526, 204, 698, 532, 550, 205, 206, 207, 208, 209, 701, 210, 211, 136, 212, 213, 655, 666, 214, 593, 513, 580, 687, 215, 216, 217, 218, 219, 220, 221, 678, 695, 223, 224, 225, 226, 227, 228, 707, 229, 531, 230, 535, 231, 658, 232, 558, 233, 234, 235, 236, 237, 605, 525, 697, 676, 238, 542, 572, 239, 240, 615, 241, 523, 665, 242, 671, 243, 515, 244, 574, 245, 246, 247, 248, 249, 250, 251, 533, 252, 562, 253, 492, 614, 498, 608, 254, 255, 256, 257, 258, 259, 260, 261, 262, 540, 263, 700, 503, 634, 556, 590, 594, 635, 683, 264, 265, 266, 507, 267, 268, 269, 270, 272, 273, 274, 619, 275, 276, 706, 596, 277, 278, 279, 280, 528, 607, 281, 282, 283, 284, 551, 557, 285, 553, 685, 286, 536, 287, 537, 288, 289, 625, 290, 291, 292, 293, 294, 691, 295, 296, 297, 598, 298, 299, 602, 301, 642, 302, 303, 304, 100, 305, 306, 307, 309, 308, 310, 311, 497, 312, 313, 314, 315, 510, 604, 320, 316, 317, 592, 318, 319, 321, 322, 323, 324, 325, 582, 326, 327, 329, 570, 330, 623, 601, 534, 331, 332, 333, 334, 703, 336, 337, 338, 339, 340, 341, 587, 342, 669, 344, 345, 518, 690, 610, 346, 538, 348, 349, 350, 351, 352, 353, 624, 354, 355, 530, 356, 357, 358, 699, 628, 493, 578, 359, 663, 653, 509, 360, 361, 363, 364, 365, 613, 366, 367, 670, 368, 369, 370, 631, 371, 372, 565, 541, 612, 664, 566, 651, 600, 511, 645, 616, 374, 375, 520, 575, 606, 626, 377, 668, 378, 546, 379, 380, 381, 382, 583, 383, 384, 385, 527, 386, 387, 388, 389, 390, 516, 391, 392, 393, 667, 633, 394, 395, 396, 684, 397, 398, 399]
diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k700.json b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json
index 784fa00f71..02bf3c4044 100644
--- a/configs/recognition/uniformerv2/k710_channel_map/map_k700.json
+++ b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json
@@ -1 +1 @@
-[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 447, 639, 80, 584, 323, 249, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 453, 477, 113, 162, 75, 338, 388, 352, 439, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 484, 591, 92, 396, 185, 258, 383, 660, 435, 644, 178, 419, 71, 260, 15, 522, 629, 335, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 436, 120, 696, 450, 334, 431, 702, 280, 306, 101, 337, 173, 682, 203, 356, 475, 4, 458, 209, 505, 529, 514, 652, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 403, 656, 521, 397, 563, 252, 368, 285, 244, 569, 688, 406, 363, 355, 597, 512, 79, 268, 470, 576, 110, 343, 636, 585, 418, 72, 641, 451, 219, 496, 321, 208, 345, 340, 84, 61, 206, 415, 188, 479, 649, 62, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 405, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 409, 30, 14, 301, 677, 402, 275, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 456, 117, 54, 564, 107, 554, 445, 279, 524, 109, 122, 289, 78, 59, 241, 291, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 422, 239, 392, 99, 266, 620, 640, 100, 176, 404, 486, 473, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 467, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 460, 495, 650, 501, 434, 552, 543, 468, 519, 448, 555, 298, 672, 560, 466, 581, 70, 395, 143, 609, 499, 561, 568, 336, 481, 573, 196, 442, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 490, 375, 488, 437, 675, 235, 654, 273, 638, 438, 424, 243, 106, 648, 539, 366, 271, 427, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 408, 57, 179, 11, 373, 454, 655, 666, 35, 429, 593, 513, 580, 687, 163, 297, 195, 421, 399, 290, 382, 319, 678, 446, 695, 134, 40, 423, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 485, 697, 676, 327, 542, 401, 483, 572, 135, 394, 615, 50, 471, 523, 665, 284, 671, 177, 430, 465, 515, 67, 574, 474, 491, 379, 141, 353, 108, 37, 136, 197, 533, 272, 400, 562, 21, 413, 492, 614, 498, 440, 462, 608, 312, 463, 213, 420, 476, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 416, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 428, 528, 607, 350, 389, 328, 433, 112, 478, 551, 557, 93, 553, 685, 378, 407, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 452, 68, 42, 461, 598, 221, 411, 140, 181, 602, 118, 642, 116, 443, 381, 412, 94, 325, 77, 27, 482, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 432, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 449, 200, 455, 12, 320, 254, 570, 146, 426, 425, 457, 623, 601, 534, 464, 278, 242, 261, 36, 703, 251, 214, 441, 25, 304, 204, 157, 587, 489, 487, 255, 669, 229, 283, 518, 690, 610, 128, 414, 538, 170, 86, 74, 138, 6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 132, 13, 34, 459, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 417, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 480, 616, 269, 288, 472, 520, 575, 606, 626, 168, 668, 469, 46, 546, 444, 238, 317, 69, 211, 583, 123, 391, 330, 527, 410, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331]
+[0, 661, 611, 1, 694, 2, 3, 4, 637, 5, 617, 6, 7, 447, 639, 8, 584, 9, 10, 618, 11, 13, 14, 15, 662, 674, 589, 16, 17, 453, 477, 18, 19, 20, 21, 22, 23, 439, 603, 545, 24, 25, 26, 27, 28, 579, 29, 643, 484, 591, 30, 31, 32, 33, 34, 660, 435, 644, 35, 419, 36, 37, 38, 522, 629, 39, 705, 40, 599, 41, 621, 595, 42, 43, 689, 502, 504, 44, 436, 45, 696, 450, 46, 431, 702, 47, 48, 49, 50, 51, 682, 52, 53, 475, 54, 458, 55, 505, 529, 514, 652, 56, 548, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 403, 656, 521, 69, 563, 70, 71, 72, 73, 569, 688, 406, 74, 75, 597, 512, 76, 77, 470, 576, 78, 79, 636, 585, 418, 80, 641, 451, 81, 496, 82, 83, 84, 85, 86, 87, 88, 415, 89, 479, 649, 90, 91, 586, 92, 93, 547, 94, 95, 567, 96, 97, 405, 98, 99, 102, 103, 104, 693, 105, 106, 508, 107, 692, 108, 109, 110, 111, 112, 113, 114, 115, 409, 116, 117, 118, 677, 402, 119, 120, 121, 122, 506, 627, 123, 124, 125, 517, 686, 456, 126, 127, 564, 128, 554, 445, 129, 524, 130, 131, 132, 133, 134, 135, 137, 559, 138, 571, 139, 140, 680, 141, 142, 143, 622, 144, 422, 145, 146, 147, 148, 620, 640, 149, 150, 404, 486, 473, 151, 152, 646, 153, 154, 155, 679, 156, 157, 657, 158, 647, 159, 160, 161, 162, 163, 164, 588, 12, 704, 165, 673, 166, 500, 167, 168, 169, 170, 171, 577, 172, 632, 467, 173, 681, 174, 175, 176, 177, 178, 179, 630, 180, 494, 181, 659, 460, 495, 650, 501, 434, 552, 543, 468, 519, 448, 555, 182, 672, 560, 466, 581, 183, 184, 185, 609, 499, 561, 568, 187, 481, 573, 188, 442, 189, 190, 191, 186, 192, 549, 193, 194, 195, 544, 196, 490, 197, 488, 437, 675, 198, 654, 199, 638, 438, 424, 200, 201, 648, 539, 202, 203, 427, 526, 204, 698, 532, 550, 205, 206, 207, 208, 209, 701, 210, 408, 211, 136, 212, 213, 454, 655, 666, 214, 429, 593, 513, 580, 687, 215, 216, 217, 421, 218, 219, 220, 221, 678, 446, 695, 222, 223, 423, 224, 225, 226, 227, 228, 707, 229, 531, 230, 535, 231, 658, 232, 558, 233, 234, 235, 236, 237, 605, 525, 485, 697, 676, 238, 542, 401, 483, 572, 239, 240, 615, 241, 471, 523, 665, 242, 671, 243, 430, 465, 515, 244, 574, 474, 491, 245, 246, 247, 248, 249, 250, 251, 533, 252, 400, 562, 253, 413, 492, 614, 498, 440, 462, 608, 254, 463, 255, 420, 476, 256, 257, 258, 259, 260, 261, 262, 540, 263, 700, 503, 634, 556, 590, 594, 635, 416, 683, 264, 265, 266, 507, 267, 268, 269, 270, 272, 273, 274, 619, 275, 276, 706, 596, 277, 278, 279, 280, 428, 528, 607, 281, 282, 283, 433, 284, 478, 551, 557, 285, 553, 685, 286, 407, 536, 287, 537, 288, 289, 625, 290, 291, 292, 293, 294, 691, 295, 452, 296, 297, 461, 598, 298, 411, 299, 300, 602, 301, 642, 302, 443, 303, 412, 304, 100, 305, 306, 482, 307, 309, 308, 310, 311, 497, 312, 313, 314, 315, 510, 432, 604, 320, 316, 317, 592, 318, 319, 321, 322, 323, 324, 325, 582, 449, 326, 455, 327, 328, 329, 570, 330, 426, 425, 457, 623, 601, 534, 464, 331, 332, 333, 334, 703, 336, 337, 441, 338, 339, 340, 341, 587, 489, 487, 342, 669, 344, 345, 518, 690, 610, 346, 414, 538, 348, 349, 350, 351, 352, 353, 624, 354, 355, 530, 356, 357, 358, 699, 628, 493, 578, 359, 663, 653, 509, 360, 361, 362, 363, 364, 459, 365, 613, 366, 367, 670, 368, 369, 370, 631, 371, 417, 372, 565, 541, 612, 664, 566, 651, 600, 511, 645, 480, 616, 374, 375, 472, 520, 575, 606, 626, 377, 668, 469, 378, 546, 444, 379, 380, 381, 382, 583, 383, 384, 385, 527, 410, 386, 387, 388, 389, 390, 516, 391, 392, 393, 667, 633, 394, 395, 396, 684, 397, 398, 399]
diff --git a/configs/recognition/uniformerv2/metafile.yml b/configs/recognition/uniformerv2/metafile.yml
index 95f3a4990e..bcff2d8d70 100644
--- a/configs/recognition/uniformerv2/metafile.yml
+++ b/configs/recognition/uniformerv2/metafile.yml
@@ -372,7 +372,8 @@ Models:
Converted From:
Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
Code: https://github.com/OpenGVLab/UniFormerV2
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20230612-63cdbad9.pth
- Name: uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb
Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py
@@ -386,7 +387,7 @@ Models:
Converted From:
Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
Code: https://github.com/OpenGVLab/UniFormerV2
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20230612-d002a407.pth
- Name: uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb
Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py
@@ -400,7 +401,7 @@ Models:
Converted From:
Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
Code: https://github.com/OpenGVLab/UniFormerV2
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20230612-d723ddc1.pth
- Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb
Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py
diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py
new file mode 100644
index 0000000000..72dada4766
--- /dev/null
+++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py
@@ -0,0 +1,209 @@
+_base_ = ['../../_base_/default_runtime.py']
+
+# model settings
+num_frames = 8
+model = dict(
+ type='Recognizer3D',
+ backbone=dict(
+ type='UniFormerV2',
+ input_resolution=224,
+ patch_size=16,
+ width=768,
+ layers=12,
+ heads=12,
+ t_size=num_frames,
+ dw_reduction=1.5,
+ backbone_drop_path_rate=0.,
+ temporal_downsample=False,
+ no_lmhra=True,
+ double_lmhra=True,
+ return_list=[8, 9, 10, 11],
+ n_layers=4,
+ n_dim=768,
+ n_head=12,
+ mlp_factor=4.,
+ drop_path_rate=0.,
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=True,
+ pretrained='ViT-B/16'),
+ cls_head=dict(
+ type='TimeSformerHead',
+ dropout_ratio=0.5,
+ num_classes=710,
+ in_channels=768,
+ average_clips='prob'),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[114.75, 114.75, 114.75],
+ std=[57.375, 57.375, 57.375],
+ format_shape='NCTHW'))
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=num_frames, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='PytorchVideoWrapper',
+ op='RandAugment',
+ magnitude=7,
+ num_layers=4),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=4,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='ThreeCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+# dataset settings
+k400_data_root = 'data/kinetics400/videos_train'
+k600_data_root = 'data/kinetics600/videos'
+k700_data_root = 'data/kinetics700/videos'
+k400_data_root_val = 'data/kinetics400/videos_val'
+k600_data_root_val = k600_data_root
+k700_data_root_val = k700_data_root
+
+k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt'
+k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt'
+k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt'
+
+k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt'
+k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt'
+k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt'
+
+k400_trainset = dict(
+ type='VideoDataset',
+ ann_file=k400_ann_file_train,
+ data_prefix=dict(video=k400_data_root),
+ pipeline=train_pipeline)
+k600_trainset = dict(
+ type='VideoDataset',
+ ann_file=k600_ann_file_train,
+ data_prefix=dict(video=k600_data_root),
+ pipeline=train_pipeline)
+k700_trainset = dict(
+ type='VideoDataset',
+ ann_file=k700_ann_file_train,
+ data_prefix=dict(video=k700_data_root),
+ pipeline=train_pipeline)
+
+k400_valset = dict(
+ type='VideoDataset',
+ ann_file=k400_ann_file_val,
+ data_prefix=dict(video=k400_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+k600_valset = dict(
+ type='VideoDataset',
+ ann_file=k600_ann_file_val,
+ data_prefix=dict(video=k600_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+k700_valset = dict(
+ type='VideoDataset',
+ ann_file=k700_ann_file_val,
+ data_prefix=dict(video=k700_data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True)
+
+k400_testset = k400_valset.copy()
+k600_testset = k600_valset.copy()
+k700_testset = k700_valset.copy()
+k400_testset['pipeline'] = test_pipeline
+k600_testset['pipeline'] = test_pipeline
+k700_testset['pipeline'] = test_pipeline
+
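+# Kinetics-710 merges Kinetics-400/600/700 into a single 710-class label space;
+# the annotation lists under data/kinetics710/ are assumed to already use these
+# merged labels, and the k710_channel_map JSON files record, for each original
+# dataset class index, the corresponding index in the 710-class space.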
+k710_trainset = dict(
+ type='ConcatDataset',
+ datasets=[k400_trainset, k600_trainset, k700_trainset])
+k710_valset = dict(
+ type='ConcatDataset', datasets=[k400_valset, k600_valset, k700_valset])
+k710_testset = dict(
+ type='ConcatDataset',
+ datasets=[k400_testset, k600_testset, k700_testset],
+)
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=k710_trainset)
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=k710_valset)
+test_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=k710_testset)
+
+val_evaluator = dict(type='AccMetric')
+test_evaluator = dict(type='AccMetric')
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+base_lr = 1e-5
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
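+# 5 epochs of linear warm-up followed by cosine annealing over the remaining 50 epochs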
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.5,
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=50,
+ eta_min_ratio=0.5,
+ by_epoch=True,
+ begin=5,
+ end=55,
+ convert_to_iter_based=True)
+]
+
+default_hooks = dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (32 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=256)
diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
index 7842bf1164..4bd6537603 100644
--- a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
+++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
@@ -1,4 +1,4 @@
-_base_ = ['../../configs/_base_/default_runtime.py']
+_base_ = ['../../_base_/default_runtime.py']
# model settings
num_frames = 8
diff --git a/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py b/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py
index 1733dc289f..7392d290fb 100644
--- a/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py
+++ b/configs/recognition/videomae/vit-large-p16_videomae-k400-pre_16x4x1_kinetics-400.py
@@ -1,4 +1,4 @@
-_base_ = ['vit-base_videomae-k400-pre_16x4x1_kinetics-400.py']
+_base_ = ['vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py']
# model settings
model = dict(
diff --git a/configs/recognition/videomaev2/README.md b/configs/recognition/videomaev2/README.md
new file mode 100644
index 0000000000..3686950c1c
--- /dev/null
+++ b/configs/recognition/videomaev2/README.md
@@ -0,0 +1,63 @@
+# VideoMAE V2
+
+[VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking](https://arxiv.org/abs/2303.16727)
+
+
+
+## Abstract
+
+
+
+Scale is the primary factor for building a powerful foundation model that could well generalize to a variety of downstream tasks. However, it is still challenging to train video foundation models with billions of parameters. This paper shows that video masked autoencoder (VideoMAE) is a scalable and general self-supervised pre-trainer for building video foundation models. We scale the VideoMAE in both model and data with a core design. Specifically, we present a dual masking strategy for efficient pre-training, with an encoder operating on a subset of video tokens and a decoder processing another subset of video tokens. Although VideoMAE is very efficient due to high masking ratio in encoder, masking decoder can still further reduce the overall computational cost. This enables the efficient pre-training of billion-level models in video. We also use a progressive training paradigm that involves an initial pre-training on a diverse multi-sourced unlabeled dataset, followed by a post-pre-training on a mixed labeled dataset. Finally, we successfully train a video ViT model with a billion parameters, which achieves a new state-of-the-art performance on the datasets of Kinetics (90.0% on K400 and 89.9% on K600) and Something-Something (68.7% on V1 and 77.0% on V2). In addition, we extensively verify the pre-trained video ViT models on a variety of downstream tasks, demonstrating its effectiveness as a general video representation learner.
+
+
+
+
+
+
+
+## Results and Models
+
+### Kinetics-400
+
+| frame sampling strategy | resolution | backbone | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt |
+| :---------------------: | :------------: | :------: | :------: | :------: | :--------------------------------: | :--------------------------------: | :---------------: | :---: | :----: | :--------------------: | :-------------------: |
+| 16x4x1 | short-side 320 | ViT-S | 83.6 | 96.3 | 83.7 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 96.2 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 5 clips x 3 crops | 57G | 22M | [config](/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth) \[1\] |
+| 16x4x1 | short-side 320 | ViT-B | 86.6 | 97.3 | 86.6 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 97.3 \[[VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md)\] | 5 clips x 3 crops | 180G | 87M | [config](/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth) \[1\] |
+
+\[1\] The models were distilled from the VideoMAE V2-g model. Specifically, the models are initialized with VideoMAE V2 pre-training and then distilled on the Kinetics-710 dataset. They are ported from the repo [VideoMAE V2](https://github.com/OpenGVLab/VideoMAEv2) and tested on our data. The VideoMAE V2-g model can be obtained from the original repository. Currently, we only support testing of VideoMAE V2 models.
+
+1. The values in the columns named "reference" are the results reported by the original repo.
+2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
+
+For more details on data preparation, you can refer to [preparing_kinetics](/tools/data/kinetics/README.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the ViT-base model on the Kinetics-400 dataset and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py \
+ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
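+
+The dumped `result.pkl` can be loaded back for offline analysis with `mmengine.load`. A minimal sketch (the exact per-sample keys are an assumption and may vary with the model):
+
+```python
+from mmengine import load
+
+results = load('result.pkl')  # a list with one prediction dict per test sample
+print(len(results))
+print(results[0].keys())  # e.g. prediction scores and the ground-truth label
+```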
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@misc{wang2023videomaev2,
+ title={VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking},
+ author={Limin Wang and Bingkun Huang and Zhiyu Zhao and Zhan Tong and Yinan He and Yi Wang and Yali Wang and Yu Qiao},
+ year={2023},
+ eprint={2303.16727},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
diff --git a/configs/recognition/videomaev2/metafile.yml b/configs/recognition/videomaev2/metafile.yml
new file mode 100644
index 0000000000..463b3c360f
--- /dev/null
+++ b/configs/recognition/videomaev2/metafile.yml
@@ -0,0 +1,43 @@
+Collections:
+- Name: VideoMAEv2
+ README: configs/recognition/videomaev2/README.md
+ Paper:
+ URL: https://arxiv.org/abs/2303.16727
+ Title: "VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking"
+
+Models:
+ - Name: vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400
+ Config: configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py
+ In Collection: VideoMAEv2
+ Metadata:
+ Architecture: ViT-S
+ Resolution: short-side 320
+ Modality: RGB
+ Converted From:
+ Weights: https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md
+ Code: https://github.com/OpenGVLab/VideoMAEv2/
+ Results:
+ - Dataset: Kinetics-400
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 83.6
+ Top 5 Accuracy: 96.3
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth
+
+ - Name: vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400
+ Config: configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py
+ In Collection: VideoMAEv2
+ Metadata:
+ Architecture: ViT-B
+ Resolution: short-side 320
+ Modality: RGB
+ Converted From:
+ Weights: https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/MODEL_ZOO.md
+ Code: https://github.com/OpenGVLab/VideoMAEv2/
+ Results:
+ - Dataset: Kinetics-400
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 86.6
+ Top 5 Accuracy: 97.3
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth
diff --git a/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py b/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py
new file mode 100644
index 0000000000..d6f6e26a5f
--- /dev/null
+++ b/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py
@@ -0,0 +1,61 @@
+_base_ = ['../../_base_/default_runtime.py']
+
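+# Test-only config: MMAction2 currently supports only testing for VideoMAE V2
+# models (see the README), so no training/validation settings are defined here.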
+# model settings
+model = dict(
+ type='Recognizer3D',
+ backbone=dict(
+ type='VisionTransformer',
+ img_size=224,
+ patch_size=16,
+ embed_dims=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ qkv_bias=True,
+ num_frames=16,
+ norm_cfg=dict(type='LN', eps=1e-6)),
+ cls_head=dict(
+ type='TimeSformerHead',
+ num_classes=400,
+ in_channels=768,
+ average_clips='prob'),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'))
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+test_pipeline = [
+ dict(type='DecordInit'),
+ dict(
+ type='SampleFrames',
+ clip_len=16,
+ frame_interval=4,
+ num_clips=5,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='ThreeCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_test,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_evaluator = dict(type='AccMetric')
+test_cfg = dict(type='TestLoop')
diff --git a/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py b/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py
new file mode 100644
index 0000000000..e4d94d1cc3
--- /dev/null
+++ b/configs/recognition/videomaev2/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py
@@ -0,0 +1,6 @@
+_base_ = ['vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py']
+
+# model settings
+model = dict(
+ backbone=dict(embed_dims=384, depth=12, num_heads=6),
+ cls_head=dict(in_channels=384))
diff --git a/configs/recognition_audio/resnet/README.md b/configs/recognition_audio/resnet/README.md
index f74f5c6ccc..f6386e313f 100644
--- a/configs/recognition_audio/resnet/README.md
+++ b/configs/recognition_audio/resnet/README.md
@@ -29,7 +29,7 @@ We present Audiovisual SlowFast Networks, an architecture for integrated audiovi
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
-For more details on data preparation, you can refer to `Prepare audio` in [Data Preparation Tutorial](/docs/en/user_guides/2_data_prepare.md).
+For more details on data preparation, you can refer to `Prepare audio` in [Data Preparation Tutorial](/docs/en/user_guides/prepare_dataset.md).
## Train
diff --git a/configs/retrieval/clip4clip/README.md b/configs/retrieval/clip4clip/README.md
new file mode 100644
index 0000000000..e898fbb324
--- /dev/null
+++ b/configs/retrieval/clip4clip/README.md
@@ -0,0 +1,74 @@
+# CLIP4Clip
+
+[CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval](https://arxiv.org/abs/2104.08860)
+
+
+
+## Abstract
+
+
+
+Video-text retrieval plays an essential role in multi-modal research and has been widely used in many real-world web applications. The CLIP (Contrastive Language-Image Pre-training), an image-language pre-training model, has demonstrated the power of visual concepts learning from web collected image-text datasets. In this paper, we propose a CLIP4Clip model to transfer the knowledge of the CLIP model to video-language retrieval in an end-to-end manner. Several questions are investigated via empirical studies: 1) Whether image feature is enough for video-text retrieval? 2) How a post-pretraining on a large-scale video-text dataset based on the CLIP affect the performance? 3) What is the practical mechanism to model temporal dependency between video frames? And 4) The Hyper-parameters sensitivity of the model on video-text retrieval task. Extensive experimental results present that the CLIP4Clip model transferred from the CLIP can achieve SOTA results on various video-text retrieval datasets, including MSR-VTT, MSVC, LSMDC, ActivityNet, and DiDeMo.
+
+
+
+
+
+
+
+## Results and Models
+
+### MSRVTT-9k
+
+| frame sampling strategy | resolution | gpus | backbone | adapter | pretrain | Recall@1 | Recall@5 | Recall@10 | MdR | MnR | testing protocol | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :------: | :-----: | :------: | :------: | :------: | :-------: | :-: | :--: | :--------------: | :------------------------------: | :----------------------------: | :----------------------------: |
+| uniform 12 | 224x224 | 8 | ViT-B/32 | Mean | clip | 43.1 | 69.4 | 78.9 | 2.0 | 16.8 | 1 clips x 1 crop | [config](/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb_20230612-b9706e54.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.log) |
+
+For more details on data preparation, you can refer to [video_retrieval](/tools/data/video_retrieval/README.md).
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the CLIP4Clip model on the MSRVTT-9k dataset with deterministic behavior and periodic validation.
+
+```shell
+python tools/train.py configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py \
+ --seed 0 --deterministic
+```
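+
+The reported checkpoint was trained with 8 GPUs (see `metafile.yml`). A multi-GPU launch would look roughly as follows, assuming the standard OpenMMLab `tools/dist_train.sh` launcher:
+
+```shell
+bash tools/dist_train.sh configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py 8 \
+    --seed 0 --deterministic
+```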
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the CLIP4Clip model on the MSRVTT-9k dataset and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py \
+ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
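+
+The retrieval metrics reported above (Recall@K, MdR, MnR) are computed from the text-to-video similarity matrix. A minimal sketch of their definitions (an illustration only, not the exact `RetrievalMetric` implementation):
+
+```python
+import numpy as np
+
+def retrieval_metrics(sim: np.ndarray) -> dict:
+    """sim[i, j]: similarity of text query i to video j; the ground truth is the diagonal."""
+    order = (-sim).argsort(axis=1)  # videos ranked by decreasing similarity per query
+    gt_rank = (order == np.arange(len(sim))[:, None]).argmax(axis=1)  # 0-based rank of the true video
+    return {
+        'R@1': float((gt_rank < 1).mean() * 100),
+        'R@5': float((gt_rank < 5).mean() * 100),
+        'R@10': float((gt_rank < 10).mean() * 100),
+        'MdR': float(np.median(gt_rank + 1)),  # median rank (1-based)
+        'MnR': float((gt_rank + 1).mean()),  # mean rank (1-based)
+    }
+```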
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@article{luo2022clip4clip,
+ title={CLIP4Clip: An empirical study of CLIP for end to end video clip retrieval and captioning},
+ author={Luo, Huaishao and Ji, Lei and Zhong, Ming and Chen, Yang and Lei, Wen and Duan, Nan and Li, Tianrui},
+ journal={Neurocomputing},
+ volume={508},
+ pages={293--304},
+ year={2022},
+}
+```
diff --git a/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py b/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py
new file mode 100644
index 0000000000..3ad189a7f3
--- /dev/null
+++ b/configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py
@@ -0,0 +1,122 @@
+_base_ = '../../_base_/default_runtime.py'
+
+model = dict(
+ type='CLIPSimilarity',
+ clip_arch='ViT-B/32',
+ to_float32=True,
+ frozen_layers=0,
+ data_preprocessor=dict(
+ type='MultiModalDataPreprocessor',
+ preprocessors=dict(
+ imgs=dict(
+ type='ActionDataPreprocessor',
+ mean=[122.771, 116.746, 104.093],
+ std=[68.500, 66.632, 70.323],
+ format_shape='NCHW'),
+ text=dict(type='ActionDataPreprocessor', to_float32=False))),
+ adapter=dict(type='SimpleMeanAdapter'))
+
+dataset_type = 'VideoTextDataset'
+data_root = 'data/video_retrieval/msrvtt'
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=12, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='CLIPTokenize'),
+ dict(type='PackActionInputs', collect_keys=('imgs', 'text'))
+]
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=12, num_clips=1, test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='CLIPTokenize'),
+ dict(type='PackActionInputs', collect_keys=('imgs', 'text'))
+]
+test_pipeline = val_pipeline
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='train_9k.json',
+ data_root=data_root,
+ data_prefix=dict(video='videos'),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='test_JSFUSION.json',
+ data_root=data_root,
+ data_prefix=dict(video='videos'),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='test_JSFUSION.json',
+ data_root=data_root,
+ data_prefix=dict(video='videos'),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type='RetrievalMetric')
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
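+# Linear warm-up for the first half epoch, then cosine annealing over the remaining 4.5 epochs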
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.05,
+ by_epoch=True,
+ begin=0,
+ end=0.5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=4.5,
+ eta_min=0,
+ by_epoch=True,
+ begin=0.5,
+ end=5,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW',
+ lr=1e-05,
+ betas=(0.9, 0.999),
+ eps=1e-08,
+ weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.),
+)
+
+default_hooks = dict(checkpoint=dict(save_best=None))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/retrieval/clip4clip/metafile.yml b/configs/retrieval/clip4clip/metafile.yml
new file mode 100644
index 0000000000..1b798f1421
--- /dev/null
+++ b/configs/retrieval/clip4clip/metafile.yml
@@ -0,0 +1,28 @@
+Collections:
+ - Name: CLIP4Clip
+ README: configs/retrieval/clip4clip/README.md
+ Paper:
+ URL: https://arxiv.org/abs/2104.08860
+ Title: 'CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval'
+
+Models:
+ - Name: clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb
+ Config: configs/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py
+ In Collection: CLIP4Clip
+ Metadata:
+ Architecture: ViT-B/32
+ Batch Size: 16
+ Epochs: 5
+ Training Data: MSRVTT-9k
+ Training Resources: 8 GPUs
+ Results:
+ Dataset: MSRVTT
+ Task: Video Retrieval
+ Metrics:
+ Recall@1: 43.1
+ Recall@5: 69.4
+ Recall@10: 78.9
+ MdR: 2.0
+ MnR: 16.8
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/retrieval/clip4clip/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb/clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb_20230612-b9706e54.pth
diff --git a/configs/skeleton/stgcn/README.md b/configs/skeleton/stgcn/README.md
index bd04d6cdbc..31438d626e 100644
--- a/configs/skeleton/stgcn/README.md
+++ b/configs/skeleton/stgcn/README.md
@@ -63,7 +63,7 @@ Dynamics of human body skeletons convey significant information for human action
| | four-stream | | | 86.19 | | | | | | |
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size.
-2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/useful_tools.md#multi-stream-fusion).
+2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/useful_tools.md#multi-stream-fusion.md).
## Train
diff --git a/dataset-index.yml b/dataset-index.yml
new file mode 100644
index 0000000000..57f2f4f62a
--- /dev/null
+++ b/dataset-index.yml
@@ -0,0 +1,39 @@
+kinetics400:
+ dataset: Kinetics-400
+ download_root: data
+ data_root: data/kinetics400
+ script: tools/data/kinetics/k400_preprocess.sh
+
+kinetics600:
+ dataset: Kinetics600
+ download_root: data
+ data_root: data/kinetics600
+ script: tools/data/kinetics/k600_preprocess.sh
+
+kinetics700:
+ dataset: Kinetics_700
+ download_root: data
+ data_root: data/kinetics700
+ script: tools/data/kinetics/k700_preprocess.sh
+
+sthv2:
+ dataset: sthv2
+ download_root: data
+ data_root: data/sthv2
+ script: tools/data/sthv2/preprocess.sh
+
+ucf-101:
+ dataset: UCF101
+ download_root: data
+ data_root: data/ucf101
+
+finegym:
+ dataset: FineGym
+ download_root: data
+ data_root: data/gym
+
+diving48:
+ dataset: diving48
+ download_root: data
+ data_root: data/diving48
+ script: tools/data/diving48/preprocess.sh
diff --git a/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py
index 934a3a5bc4..0c193a7088 100644
--- a/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py
+++ b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py
@@ -116,7 +116,7 @@
file_client_args = dict(backend='disk')
test_pipeline = [
- dict(type='mmdet.LoadImageFromFile', file_client_args=file_client_args),
+ dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True),
dict(
type='mmdet.PackDetInputs',
diff --git a/demo/demo_spatiotemporal_det.py b/demo/demo_spatiotemporal_det.py
index 9c77c6f2ca..c76e3f1a17 100644
--- a/demo/demo_spatiotemporal_det.py
+++ b/demo/demo_spatiotemporal_det.py
@@ -15,7 +15,7 @@
from mmaction.apis import detection_inference
from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
-from mmaction.utils import frame_extract
+from mmaction.utils import frame_extract, get_str_type
try:
import moviepy.editor as mpy
@@ -86,7 +86,7 @@ def visualize(frames, annotations, plate=plate_blue, max_num=5):
if k >= max_num:
break
text = abbrev(lb)
- text = ': '.join([text, str(score[k])])
+ text = ': '.join([text, f'{score[k]:>.2f}'])
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
THICKNESS)[0]
@@ -248,7 +248,9 @@ def main():
config.merge_from_dict(args.cfg_options)
val_pipeline = config.val_pipeline
- sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
+ sampler = [
+ x for x in val_pipeline if get_str_type(x['type']) == 'SampleAVAFrames'
+ ][0]
clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py
index 9fba6fa86a..755102ff28 100644
--- a/demo/demo_spatiotemporal_det_onnx.py
+++ b/demo/demo_spatiotemporal_det_onnx.py
@@ -13,7 +13,7 @@
from mmengine import DictAction
from mmaction.apis import detection_inference
-from mmaction.utils import frame_extract
+from mmaction.utils import frame_extract, get_str_type
try:
import moviepy.editor as mpy
@@ -242,7 +242,9 @@ def main():
config.merge_from_dict(args.cfg_options)
val_pipeline = config.val_pipeline
- sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
+ sampler = [
+ x for x in val_pipeline if get_str_type(x['type']) == 'SampleAVAFrames'
+ ][0]
clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
diff --git a/demo/webcam_demo.py b/demo/webcam_demo.py
index cdcb9fc289..cdd8585540 100644
--- a/demo/webcam_demo.py
+++ b/demo/webcam_demo.py
@@ -12,6 +12,7 @@
from mmengine.dataset import Compose, pseudo_collate
from mmaction.apis import init_recognizer
+from mmaction.utils import get_str_type
FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL
FONTSCALE = 1
@@ -194,12 +195,12 @@ def main():
pipeline = cfg.test_pipeline
pipeline_ = pipeline.copy()
for step in pipeline:
- if 'SampleFrames' in step['type']:
+ if 'SampleFrames' in get_str_type(step['type']):
sample_length = step['clip_len'] * step['num_clips']
data['num_clips'] = step['num_clips']
data['clip_len'] = step['clip_len']
pipeline_.remove(step)
- if step['type'] in EXCLUED_STEPS:
+ if get_str_type(step['type']) in EXCLUED_STEPS:
# remove step to decode frames
pipeline_.remove(step)
test_pipeline = Compose(pipeline_)
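The `get_str_type` helper adopted in these demos is needed because, with the new pure-Python config style, `step['type']` may be a class object rather than a string, so a plain string comparison no longer matches. A minimal sketch of what such a helper could look like (hypothetical; the actual implementation lives in `mmaction.utils` and may differ):

```python
# Hypothetical sketch of a get_str_type-style helper (not the actual
# mmaction.utils implementation). It normalizes a config `type` field,
# which may be a string (classic configs) or a class/function object
# (pure-Python style configs), into a comparable string name.
def get_str_type(module):
    if isinstance(module, str):
        return module
    return getattr(module, '__name__', str(module))
```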
diff --git a/docker/Dockerfile b/docker/Dockerfile
index bf6c67a696..29a24c3ca5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,5 +1,5 @@
-ARG PYTORCH="1.6.0"
-ARG CUDA="10.1"
+ARG PYTORCH="1.8.1"
+ARG CUDA="10.2"
ARG CUDNN="7"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
diff --git a/docs/en/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css
index 07611c2b06..b2a56e8de4 100644
--- a/docs/en/_static/css/readthedocs.css
+++ b/docs/en/_static/css/readthedocs.css
@@ -4,3 +4,59 @@
height: 40px;
width: 130px;
}
+
+@media screen and (min-width: 1100px) {
+ .header-logo {
+ top: -12px;
+ }
+ }
+
+ pre {
+ white-space: pre;
+ }
+
+ @media screen and (min-width: 2000px) {
+ .pytorch-content-left {
+ width: 1200px;
+ margin-left: 30px;
+ }
+ article.pytorch-article {
+ max-width: 1200px;
+ }
+ .pytorch-breadcrumbs-wrapper {
+ width: 1200px;
+ }
+ .pytorch-right-menu.scrolling-fixed {
+ position: fixed;
+ top: 45px;
+ left: 1580px;
+ }
+ }
+
+
+ article.pytorch-article section code {
+ padding: .2em .4em;
+ background-color: #f3f4f7;
+ border-radius: 5px;
+ }
+
+ /* Disable the change in tables */
+ article.pytorch-article section table code {
+ padding: unset;
+ background-color: unset;
+ border-radius: unset;
+ }
+
+ table.autosummary td {
+ width: 50%
+ }
+
+ img.align-center {
+ display: block;
+ margin-left: auto;
+ margin-right: auto;
+ }
+
+ article.pytorch-article p.rubric {
+ font-weight: bold;
+ }
diff --git a/docs/en/_static/js/custom.js b/docs/en/_static/js/custom.js
new file mode 100644
index 0000000000..3204fc7ef0
--- /dev/null
+++ b/docs/en/_static/js/custom.js
@@ -0,0 +1,10 @@
+var collapsedSections = ['Dataset Zoo'];
+
+$(document).ready(function () {
+ $('.model-summary').DataTable({
+ "stateSave": false,
+ "lengthChange": false,
+ "pageLength": 20,
+ "order": []
+ });
+ });
diff --git a/docs/en/_templates/404.html b/docs/en/_templates/404.html
new file mode 100644
index 0000000000..3c808f4c23
--- /dev/null
+++ b/docs/en/_templates/404.html
@@ -0,0 +1,18 @@
+{% extends "layout.html" %}
+
+{% block body %}
+
+Page Not Found
+
+ The page you are looking for cannot be found.
+
+
+ If you just switched documentation versions, it is likely that the page you were on has been moved. You can look for it in
+ the table of contents on the left, or go to the homepage.
+
+
+ If you cannot find the documentation you want, please open an issue to tell us!
+
+
+{% endblock %}
diff --git a/docs/en/advanced_guides/customize_dataset.md b/docs/en/advanced_guides/customize_dataset.md
index 690ce3859c..f5ab6254ae 100644
--- a/docs/en/advanced_guides/customize_dataset.md
+++ b/docs/en/advanced_guides/customize_dataset.md
@@ -9,7 +9,7 @@ In this tutorial, we will introduce some methods about how to customize your own
## General understanding of the Dataset in MMAction2
-MMAction2 provides task-specific `Dataset` class, e.g. `VideoDataset`/`RawframeDataset` for action recognition, `AVADataset` for spatio-temporal action detection, `PoseDataset` for skeleton-based action recognition. These task-specific datasets only require the implementation of `load_data_list(self)` for generating a data list from the annotation file. The remaining functions are automatically handled by the superclass (i.e., `BaseActionDataset` and `BaseDataset`). The following table shows the inherent relationship and the main method of the modules.
+MMAction2 provides task-specific `Dataset` class, e.g. `VideoDataset`/`RawframeDataset` for action recognition, `AVADataset` for spatio-temporal action detection, `PoseDataset` for skeleton-based action recognition. These task-specific datasets only require the implementation of `load_data_list(self)` for generating a data list from the annotation file. The remaining functions are automatically handled by the superclass (i.e., `BaseActionDataset` and `BaseDataset`). The following table shows the inheritance relationship and the main method of the modules.
| Class Name | Class Method |
| ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
diff --git a/docs/en/advanced_guides/customize_logging.md b/docs/en/advanced_guides/customize_logging.md
index ccbbeeafed..c097995daf 100644
--- a/docs/en/advanced_guides/customize_logging.md
+++ b/docs/en/advanced_guides/customize_logging.md
@@ -9,7 +9,7 @@ MMAction2 produces a lot of logs during the running process, such as loss, itera
## Flexible Logging System
-The MMAction2 logging system is configured by the `LogProcessor` in [default_runtime](/configs/_base_/default_runtime.py) by default, which is equivalent to:
+The MMAction2 logging system is configured by the `LogProcessor` in [default_runtime](https://github.com/open-mmlab/mmaction2/tree/main/configs/_base_/default_runtime.py) by default, which is equivalent to:
```python
log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)
diff --git a/docs/en/advanced_guides/customize_models.md b/docs/en/advanced_guides/customize_models.md
index 3d8c0e1d4e..37e4ca399f 100644
--- a/docs/en/advanced_guides/customize_models.md
+++ b/docs/en/advanced_guides/customize_models.md
@@ -1 +1,3 @@
# Customize Models
+
+coming soon...
diff --git a/docs/en/conf.py b/docs/en/conf.py
index 6623d99b45..14a19907db 100644
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -42,8 +42,17 @@ def get_version():
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode',
- 'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser'
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.autosummary',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.napoleon',
+ 'sphinx.ext.viewcode',
+ 'myst_parser',
+ 'sphinx_markdown_tables',
+ 'sphinx_copybutton',
+ 'sphinx_tabs.tabs',
+ 'notfound.extension',
+ 'sphinxcontrib.jquery',
]
# numpy and torch are required
@@ -90,26 +99,35 @@ def get_version():
{
'name':
'Upstream',
- 'children': [
- {
- 'name': 'MMCV',
- 'url': 'https://github.com/open-mmlab/mmcv',
- 'description': 'Foundational library for computer vision'
- },
- {
- 'name':
- 'MMClassification',
- 'url':
- 'https://github.com/open-mmlab/mmclassification',
- 'description':
- 'Open source image classification toolbox based on PyTorch'
- },
- {
- 'name': 'MMDetection',
- 'url': 'https://github.com/open-mmlab/mmdetection',
- 'description': 'Object detection toolbox and benchmark'
- },
- ]
+ 'children': [{
+ 'name':
+ 'MMCV',
+ 'url':
+ 'https://github.com/open-mmlab/mmcv',
+ 'description':
+ 'Foundational library for computer vision'
+ }, {
+ 'name':
+ 'MMPreTrain',
+ 'url':
+ 'https://github.com/open-mmlab/mmpretrain',
+ 'description':
+ 'Open source pre-training toolbox based on PyTorch'
+ }, {
+ 'name':
+ 'MMDetection',
+ 'url':
+ 'https://github.com/open-mmlab/mmdetection',
+ 'description':
+ 'Object detection toolbox and benchmark'
+ }, {
+ 'name':
+ 'MMPose',
+ 'url':
+ 'https://github.com/open-mmlab/mmpose',
+ 'description':
+ 'Open-source toolbox for pose estimation based on PyTorch.'
+ }]
},
],
# Specify the language of shared menu
@@ -121,15 +139,29 @@ def get_version():
master_doc = 'index'
html_static_path = ['_static']
-html_css_files = ['css/readthedocs.css']
+html_css_files = [
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
+ 'css/readthedocs.css'
+]
+html_js_files = [
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
+ 'js/custom.js'
+]
myst_enable_extensions = ['colon_fence']
myst_heading_anchors = 3
+# The not found page
+notfound_template = '404.html'
+
def builder_inited_handler(app):
- subprocess.run(['bash', './merge_docs.sh'])
- subprocess.run(['python', './stat.py'])
+ if subprocess.run(['python', './stat.py']).returncode != 0:
+ raise RuntimeError('Failed to run the script `stat.py`.')
+ if subprocess.run(['python', './project_zoo.py']).returncode != 0:
+ raise RuntimeError('Failed to run the script `project_zoo.py`.')
+ if subprocess.run(['python', './dataset_zoo.py']).returncode != 0:
+ raise RuntimeError('Failed to run the script `dataset_zoo.py`.')
def setup(app):
diff --git a/docs/en/dataset_zoo.py b/docs/en/dataset_zoo.py
new file mode 100644
index 0000000000..ce89980477
--- /dev/null
+++ b/docs/en/dataset_zoo.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+import re
+from pathlib import Path
+
+from utils import replace_link
+
+DATASETS_ROOT = Path('dataset_zoo') # Path to save generated paper pages.
+MODELZOO_TEMPLATE = """\
+# Dataset Zoo Summary
+
+In this page, we list [all datasets](#all-supported-datasets) we support. You can click the link to jump to the corresponding dataset pages.
+
+## All supported datasets
+
+* Number of datasets: {num_datasets}
+{dataset_msg}
+
+""" # noqa: E501
+
+
+def generate_datasets_pages():
+ dataset_list = Path('../../tools/data').glob('*/README.md')
+ num_datasets = 0
+ dataset_msgs = []
+
+ for file in dataset_list:
+ num_datasets += 1
+
+ copy = DATASETS_ROOT / file.parent.with_suffix('.md').name
+
+ with open(file, 'r') as f:
+ content = f.read()
+
+ title = re.match(r'^# Preparing (.*)', content).group(1)
+ content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
+ file)
+ content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content,
+ file)
+ dataset_msgs.append(f'\t - [{title}]({copy})')
+
+ with open(copy, 'w') as f:
+ f.write(content)
+
+ dataset_msg = '\n'.join(dataset_msgs)
+
+ modelzoo = MODELZOO_TEMPLATE.format(
+ num_datasets=num_datasets,
+ dataset_msg=dataset_msg,
+ )
+
+ with open('datasetzoo_statistics.md', 'w') as f:
+ f.write(modelzoo)
+
+
+DATASETS_ROOT.mkdir(exist_ok=True)
+generate_datasets_pages()
diff --git a/docs/en/docutils.conf b/docs/en/docutils.conf
new file mode 100644
index 0000000000..0c00c84688
--- /dev/null
+++ b/docs/en/docutils.conf
@@ -0,0 +1,2 @@
+[html writers]
+table_style: colwidths-auto
diff --git a/docs/en/get_started/contribution_guide.md b/docs/en/get_started/contribution_guide.md
index 3748329873..5e28623a77 100644
--- a/docs/en/get_started/contribution_guide.md
+++ b/docs/en/get_started/contribution_guide.md
@@ -5,12 +5,12 @@ All kinds of contributions are welcome, including but not limited to the followi
- Fixes (typo, bugs)
- New features and components
- Add documentation or translate the documentation into other languages
-- Add new project (Recommended) about video understanding algorithm with less restriction, refer to [here](/projects/README.md) for details
+- Add new project (Recommended) about video understanding algorithm with less restriction, refer to [here](../projectzoo.md) for details
## Workflow
1. Fork and pull the latest mmaction2
-2. Checkout a new branch with a meaningful name (do not use master branch for PRs)
+2. Checkout a new branch with a meaningful name (do not use main branch for PRs)
3. Commit your changes
4. Create a PR
diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md
index 45d1b6889a..08679a8349 100644
--- a/docs/en/get_started/faq.md
+++ b/docs/en/get_started/faq.md
@@ -12,7 +12,7 @@ We list some common issues faced by many users and their corresponding solutions
- [Testing](#testing)
Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them.
-If the contents here do not cover your issue, please create an issue using the [provided templates](/.github/ISSUE_TEMPLATE/error-report.md) and make sure to fill in all required information in the template.
+If the contents here do not cover your issue, please create an issue using the [provided templates](https://github.com/open-mmlab/mmaction2/tree/main/.github/ISSUE_TEMPLATE/error-report.md) and make sure to fill in all required information in the template.
## Installation
@@ -88,7 +88,7 @@ If the contents here do not cover your issue, please create an issue using the [
- **How to set `load_from` value in config files to finetune models?**
- In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](/docs/en/user_guides/config.md),
+ In MMAction2, we set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](https://github.com/open-mmlab/mmaction2/tree/main/docs/en/user_guides/config.md),
users can directly change it by setting `load_from` in their configs.
- **How to use `RawFrameDataset` for training?**
diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md
index 18b0545f64..8cc64b7798 100644
--- a/docs/en/get_started/installation.md
+++ b/docs/en/get_started/installation.md
@@ -4,7 +4,7 @@
In this section we demonstrate how to prepare an environment with PyTorch.
-MMAction2 works on Linux, Windows and macOS. It requires Python 3.7+, CUDA 9.2+ and PyTorch 1.6+.
+MMAction2 works on Linux, Windows and macOS. It requires Python 3.7+, CUDA 10.2+ and PyTorch 1.8+.
```{note}
If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](#installation). Otherwise, you can follow these steps for the preparation.
@@ -185,7 +185,7 @@ We provide a [Dockerfile](https://github.com/open-mmlab/mmaction2/blob/main/dock
to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) >=19.03.
```shell
-# build an image with PyTorch 1.6.0, CUDA 10.1, CUDNN 7.
+# build an image with PyTorch 1.8.1, CUDA 10.2, CUDNN 7.
# If you prefer other versions, just modified the Dockerfile
docker build -f ./docker/Dockerfile --rm -t mmaction2 .
```
diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md
index 498b093830..3eae38e089 100644
--- a/docs/en/get_started/overview.md
+++ b/docs/en/get_started/overview.md
@@ -77,8 +77,8 @@ We have prepared a wealth of documents to meet your various needs:
For supported model zoo and dataset zoo
-- [Model Zoo](../model_zoo/modelzoo.md)
-- [Dataset Zoo](../datasetzoo.md)
+- [Model Zoo](../modelzoo_statistics.md)
+- [Dataset Zoo](../datasetzoo_statistics.md)
diff --git a/docs/en/get_started/quick_run.md b/docs/en/get_started/quick_run.md
index 619e904370..ab7bd569c3 100644
--- a/docs/en/get_started/quick_run.md
+++ b/docs/en/get_started/quick_run.md
@@ -1,6 +1,6 @@
# Quick Run
-This chapter will introduce you to the fundamental functionalities of MMAction2. We assume that you have [installed MMAction2 from source](../installation#best-practices).
+This chapter will introduce you to the fundamental functionalities of MMAction2. We assume that you have [installed MMAction2 from source](installation.md#best-practices).
- [Quick Run](#quick-run)
- [Inference](#inference)
@@ -39,7 +39,7 @@ You should be able to see a pop-up video and the inference result printed out in
If you are running MMAction2 on a server without a GUI or via an SSH tunnel with X11 forwarding disabled, you may not see the pop-up window.
```
-A detailed description of MMAction2's inference interface can be found [here](/demo/README.md#inferencer).
+A detailed description of MMAction2's inference interface can be found [here](https://github.com/open-mmlab/mmaction2/tree/main/demo/README.md#inferencer).
In addition to using our well-provided pre-trained models, you can also train models on your own datasets. In the next section, we will take you through the basic functions of MMAction2 by training TSN on the tiny [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset as an example.
@@ -119,7 +119,7 @@ For a more detailed description of config, please refer to [here](../user_guides
## Browse the Dataset
-Before we start the training, we can also visualize the frames processed by training-time data transforms. It's quite simple: pass the config file we need to visualize into the [browse_dataset.py](/tools/analysis_tools/browse_dataset.py) script.
+Before we start the training, we can also visualize the frames processed by training-time data transforms. It's quite simple: pass the config file we need to visualize into the [browse_dataset.py](https://github.com/open-mmlab/mmaction2/tree/main/tools/analysis_tools/browse_dataset.py) script.
```Bash
python tools/visualizations/browse_dataset.py \
@@ -212,7 +212,7 @@ And get the outputs like:
03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795
```
-The model achieves an hmean of 1.0000 on this dataset.
+The model achieves a top-1 accuracy of 1.0000 on this dataset.
```{note}
For advanced usage of testing, such as CPU testing, multi-GPU testing, and cluster testing, please refer to [Training and Testing](../user_guides/train_test.md).
diff --git a/docs/en/index.rst b/docs/en/index.rst
index 7d39be0d18..32d36cc609 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -35,47 +35,45 @@ You can switch between Chinese and English documents in the lower-left corner of
advanced_guides/customize_optimizer.md
advanced_guides/customize_logging.md
advanced_guides/deploy.md
-
-.. toctree::
- :maxdepth: 1
- :caption: Advanced Guides
useful_tools.md
.. toctree::
:maxdepth: 1
- :caption: Migration
+ :caption: Model Zoo
- migration.md
+ modelzoo_statistics.md
+ model_zoo/recognition.md
+ model_zoo/recognition_audio.md
+ model_zoo/skeleton.md
+ model_zoo/detection.md
+ model_zoo/retrieval.md
+ model_zoo/localization.md
.. toctree::
:maxdepth: 1
- :caption: API Reference
+ :caption: Dataset Zoo
+ :glob:
- api.rst
+ datasetzoo_statistics.md
+ dataset_zoo/*
.. toctree::
:maxdepth: 1
- :caption: Model Zoo
-
- model_zoo/modelzoo.md
- model_zoo/recognition_models.md
- model_zoo/detection_models.md
- model_zoo/skeleton_models.md
- model_zoo/localization_models.md
+ :caption: Projects
+ projectzoo.md
.. toctree::
:maxdepth: 1
- :caption: Dataset Zoo
+ :caption: Migration
- datasetzoo_overview.md
- datasetzoo.md
+ migration.md
.. toctree::
:maxdepth: 1
- :caption: Projects
+ :caption: API Reference
- projectzoo.md
+ api.rst
.. toctree::
:maxdepth: 1
diff --git a/docs/en/merge_docs.sh b/docs/en/merge_docs.sh
deleted file mode 100644
index a2a4e0ba6c..0000000000
--- a/docs/en/merge_docs.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-
-# gather models
-mkdir -p model_zoo
-cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/main/=g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md
-cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/main/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md
-cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/main/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md
-cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/main/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md
-cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/main/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md
-
-# gather projects
-# TODO: generate table of contents for project zoo
-cat ../../projects/README.md > projectzoo.md
-cat ../../projects/example_project/README.md >> projectzoo.md
-cat ../../projects/ctrgcn/README.md >> projectzoo.md
-cat ../../projects/msg3d/README.md >> projectzoo.md
-
-# gather datasets
-cat supported_datasets.md > datasetzoo.md
-cat ../../tools/data/*/README.md | sed 's/# Preparing/# /g' | sed 's/#/#&/' >> datasetzoo.md
-
-sed -i 's/(\/tools\/data\/activitynet\/README.md/(#activitynet/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/kinetics\/README.md/(#kinetics-400600700/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/mit\/README.md/(#moments-in-time/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/mmit\/README.md/(#multi-moments-in-time/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/sthv1\/README.md/(#something-something-v1/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/sthv2\/README.md/(#something-something-v2/g' datasetzoo.md
-sed -i "s/(\/tools\/data\/thumos14\/README.md/(#thumos14/g" datasetzoo.md
-sed -i 's/(\/tools\/data\/ucf101\/README.md/(#ucf-101/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/ucf101_24\/README.md/(#ucf101-24/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/jhmdb\/README.md/(#jhmdb/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/hvu\/README.md/(#hvu/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/hmdb51\/README.md/(#hmdb51/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/jester\/README.md/(#jester/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/ava\/README.md/(#ava/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/gym\/README.md/(#gym/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/omnisource\/README.md/(#omnisource/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/diving48\/README.md/(#diving48/g' datasetzoo.md
-sed -i 's/(\/tools\/data\/skeleton\/README.md/(#skeleton-dataset/g' datasetzoo.md
-
-cat prepare_data.md >> datasetzoo.md
-
-sed -i 's=](/=](https://github.com/open-mmlab/mmaction2/tree/main/=g' *.md
-sed -i 's=](/=](https://github.com/open-mmlab/mmaction2/tree/main/=g' */*.md
-
-sed -i 's/](\/docs\/en\//](g' datasetzoo.md
-sed -i 's/](\/docs\/en\//](g' notes/changelog.md
-sed -i 's/](\/docs\/en\//](..g' ./get_stated/*.md
-sed -i 's/](\/docs\/en\//](..g' ./tutorials/*.md
diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md
index 88d223ff57..66dfd3b144 100644
--- a/docs/en/notes/changelog.md
+++ b/docs/en/notes/changelog.md
@@ -1,5 +1,73 @@
# Changelog
+## 1.1.0 (7/3/2023)
+
+**Highlights**
+
+- Support HACS-segments dataset (ICCV'2019), MultiSports dataset (ICCV'2021), Kinetics-710 dataset (ArXiv'2022)
+- Support rich projects: gesture recognition, spatio-temporal action detection tutorial, and knowledge distillation
+- Support TCANet (CVPR'2021)
+- Support VideoMAE V2 (CVPR'2023) and VideoMAE (NeurIPS'2022) on action detection
+- Support CLIP-based multi-modality models: ActionCLIP (ArXiv'2021) and CLIP4clip (ArXiv'2022)
+- Support [Pure Python style Configuration File](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) and downloading datasets by MIM
+
+**New Features**
+
+- Support HACS-segments dataset ([2224](https://github.com/open-mmlab/mmaction2/pull/2224))
+- Support TCANet ([2271](https://github.com/open-mmlab/mmaction2/pull/2271))
+- Support MultiSports dataset ([2280](https://github.com/open-mmlab/mmaction2/pull/2280))
+- Support spatio-temporal action detection tutorial ([2428](https://github.com/open-mmlab/mmaction2/pull/2428))
+- Support knowledge distillation based on MMRazor ([2458](https://github.com/open-mmlab/mmaction2/pull/2458))
+- Support VideoMAE V2 ([2460](https://github.com/open-mmlab/mmaction2/pull/2460))
+- Support ActionCLIP ([2470](https://github.com/open-mmlab/mmaction2/pull/2470))
+- Support CLIP4clip ([2489](https://github.com/open-mmlab/mmaction2/pull/2489))
+- Support Kinetics-710 dataset ([2534](https://github.com/open-mmlab/mmaction2/pull/2534))
+- Support gesture recognition project ([2539](https://github.com/open-mmlab/mmaction2/pull/2539))
+- Support VideoMAE on action detection ([2547](https://github.com/open-mmlab/mmaction2/pull/2547))
+- Support downloading datasets by MIM ([2465](https://github.com/open-mmlab/mmaction2/pull/2465))
+- Support new config ([2542](https://github.com/open-mmlab/mmaction2/pull/2542))
+
+**Improvements**
+
+- Refactor TSM init_weights ([2396](https://github.com/open-mmlab/mmaction2/pull/2396))
+- Add unit test for Recognizer 2D ([2432](https://github.com/open-mmlab/mmaction2/pull/2432))
+- Enhance inference APIs ([2472](https://github.com/open-mmlab/mmaction2/pull/2472))
+- Support converting ST-GCN and PoseC3D to ONNX ([2543](https://github.com/open-mmlab/mmaction2/pull/2543))
+- Support feature extraction head ([2525](https://github.com/open-mmlab/mmaction2/pull/2525))
+
+**Bug Fixes**
+
+- Fix CircleCI ([2351](https://github.com/open-mmlab/mmaction2/pull/2351))
+- Fix demo skeleton script ([2380](https://github.com/open-mmlab/mmaction2/pull/2380))
+- Fix docker file branch ([2397](https://github.com/open-mmlab/mmaction2/pull/2397))
+- Fix NTU pose extraction script ([2402](https://github.com/open-mmlab/mmaction2/pull/2402))
+- Rename typing and enhance collect_env script ([2420](https://github.com/open-mmlab/mmaction2/pull/2420))
+- Fix multi-label classification ([2425](https://github.com/open-mmlab/mmaction2/pull/2425), [2466](https://github.com/open-mmlab/mmaction2/pull/2466), [2532](https://github.com/open-mmlab/mmaction2/pull/2532))
+- Fix lfb configs ([2426](https://github.com/open-mmlab/mmaction2/pull/2426))
+- Fix a warning caused by `torch.div` ([2449](https://github.com/open-mmlab/mmaction2/pull/2449))
+- Fix incompatibility of ImgAug and latest Numpy ([2451](https://github.com/open-mmlab/mmaction2/pull/2451))
+- Fix MViT with_cls_token argument ([2480](https://github.com/open-mmlab/mmaction2/pull/2480))
+- Fix timm BC-breaking for TSN ([2497](https://github.com/open-mmlab/mmaction2/pull/2497))
+- Close FileHandler in Windows so that the temporary directory can be deleted ([2565](https://github.com/open-mmlab/mmaction2/pull/2565))
+- Update minimum PyTorch version to 1.8.1 ([2568](https://github.com/open-mmlab/mmaction2/pull/2568))
+
+**Documentation**
+
+- Fix document links in README ([2358](https://github.com/open-mmlab/mmaction2/pull/2358), [2372](https://github.com/open-mmlab/mmaction2/pull/2372), [2376](https://github.com/open-mmlab/mmaction2/pull/2376), [2382](https://github.com/open-mmlab/mmaction2/pull/2382))
+- Update installation document ([2362](https://github.com/open-mmlab/mmaction2/pull/2362))
+- Update upstream library version requirement ([2383](https://github.com/open-mmlab/mmaction2/pull/2383))
+- Fix Colab tutorial ([2384](https://github.com/open-mmlab/mmaction2/pull/2384), [2391](https://github.com/open-mmlab/mmaction2/pull/2391), [2475](https://github.com/open-mmlab/mmaction2/pull/2475))
+- Refine documents ([2404](https://github.com/open-mmlab/mmaction2/pull/2404))
+- Update outdated config in readme ([2419](https://github.com/open-mmlab/mmaction2/pull/2419))
+- Update OpenMMLab related repo list ([2429](https://github.com/open-mmlab/mmaction2/pull/2429))
+- Fix UniFormer README and metafile ([2450](https://github.com/open-mmlab/mmaction2/pull/2450))
+- Add finetune document ([2457](https://github.com/open-mmlab/mmaction2/pull/2457))
+- Update FAQ document ([2476](https://github.com/open-mmlab/mmaction2/pull/2476), [2482](https://github.com/open-mmlab/mmaction2/pull/2482))
+- Update download datasets document ([2495](https://github.com/open-mmlab/mmaction2/pull/2495))
+- Translate Chinese document ([2516](https://github.com/open-mmlab/mmaction2/pull/2516), [2506](https://github.com/open-mmlab/mmaction2/pull/2506), [2499](https://github.com/open-mmlab/mmaction2/pull/2499))
+- Refactor model zoo and dataset zoo ([2552](https://github.com/open-mmlab/mmaction2/pull/2552))
+- Refactor Chinese document ([2567](https://github.com/open-mmlab/mmaction2/pull/2567))
+
## 1.0.0 (4/6/2023)
**Highlights**
diff --git a/docs/en/project_zoo.py b/docs/en/project_zoo.py
new file mode 100644
index 0000000000..ba921cd318
--- /dev/null
+++ b/docs/en/project_zoo.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+from pathlib import Path
+
+from utils import replace_link
+
+# This script reads /projects/*/README.md and generate projectzoo.md
+
+all_files = list(Path('../../projects/').glob('*/README.md'))
+example_project = '../../projects/example_project/README.md'
+all_files.remove(Path(example_project))
+all_files.insert(0, Path(example_project))
+
+project_zoo = open('../../projects/README.md').read()
+for file in all_files:
+ with open(file) as f:
+ content = f.read()
+ content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
+ file)
+ content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content,
+ file)
+
+ project_zoo += content
+
+with open('projectzoo.md', 'w') as f:
+ f.write(project_zoo)
diff --git a/docs/en/stat.py b/docs/en/stat.py
index b07d123fa8..dedb485b39 100644
--- a/docs/en/stat.py
+++ b/docs/en/stat.py
@@ -1,174 +1,268 @@
#!/usr/bin/env python
-# Copyright (c) OpenMMLab. All rights reserved.
-import functools as func
-import glob
import re
-from os.path import basename, splitext
+import shutil
+from collections import defaultdict
+from pathlib import Path
-import numpy as np
-import titlecase
+from modelindex.load_model_index import load
+from modelindex.models.Result import Result
+from tabulate import tabulate
+from utils import replace_link
+MMACT_ROOT = Path(__file__).absolute().parents[2]
+PAPERS_ROOT = Path('model_zoo') # Path to save generated paper pages.
+GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/'
+MODELZOO_TEMPLATE = """\
+# Model Zoo Summary
-def anchor(name):
- return re.sub(r'-+', '-', re.sub(r'[^a-zA-Z0-9]', '-',
- name.strip().lower())).strip('-')
+In this page, we list [all algorithms](#all-supported-algorithms) we support. You can click the link to jump to the corresponding model pages.
+And we also list all checkpoints for different tasks we provide. You can sort or search checkpoints in the table and click the corresponding link to model pages for more details.
-# Count algorithms
+## All supported algorithms
-files = sorted(glob.glob('model_zoo/*_models.md'))
-# files = sorted(glob.glob('docs/*_models.md'))
+* Number of papers: {num_papers}
+{type_msg}
-stats = []
+* Number of checkpoints: {num_ckpts}
+{paper_msg}
-for f in files:
- with open(f, 'r') as content_file:
- content = content_file.read()
-
- # title
- title = content.split('\n')[0].replace('#', '')
-
- # skip IMAGE and ABSTRACT tags
- content = [
- x for x in content.split('\n')
- if 'IMAGE' not in x and 'ABSTRACT' not in x
- ]
- content = '\n'.join(content)
-
- # count papers
- papers = set(
- (papertype, titlecase.titlecase(paper.lower().strip()))
- for (papertype, paper) in re.findall(
- r'\s*\n.*?\btitle\s*=\s*{(.*?)}',
- content, re.DOTALL))
- # paper links
- revcontent = '\n'.join(list(reversed(content.splitlines())))
- paperlinks = {}
- for _, p in papers:
- print(p)
- q = p.replace('\\', '\\\\').replace('?', '\\?')
- paperlinks[p] = ' '.join(
- (f'[->]({splitext(basename(f))[0]}.html#{anchor(paperlink)})'
- for paperlink in re.findall(
- rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
- revcontent, re.DOTALL | re.IGNORECASE)))
- print(' ', paperlinks[p])
- paperlist = '\n'.join(
- sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
- # count configs
- configs = set(x.lower().strip()
- for x in re.findall(r'https.*configs/.*\.py', content))
-
- # count ckpts
- ckpts = set(x.lower().strip()
- for x in re.findall(r'https://download.*\.pth', content)
- if 'mmaction' in x)
-
- statsmsg = f"""
-## [{title}]({f})
-
-* Number of checkpoints: {len(ckpts)}
-* Number of configs: {len(configs)}
-* Number of papers: {len(papers)}
-{paperlist}
-
- """
-
- stats.append((papers, configs, ckpts, statsmsg))
-
-allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _, _ in stats])
-allconfigs = func.reduce(lambda a, b: a.union(b), [c for _, c, _, _ in stats])
-allckpts = func.reduce(lambda a, b: a.union(b), [c for _, _, c, _ in stats])
-msglist = '\n'.join(x for _, _, _, x in stats)
-
-papertypes, papercounts = np.unique([t for t, _ in allpapers],
- return_counts=True)
-countstr = '\n'.join(
- [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
-
-modelzoo = f"""
-# Overview
-
-* Number of checkpoints: {len(allckpts)}
-* Number of configs: {len(allconfigs)}
-* Number of papers: {len(allpapers)}
-{countstr}
-
-For supported datasets, see [datasets overview](datasets.md).
-
-{msglist}
-"""
-
-with open('model_zoo/modelzoo.md', 'w') as f:
- f.write(modelzoo)
-
-# Count datasets
-
-files = ['datasetzoo.md']
-# files = sorted(glob.glob('docs/tasks/*.md'))
-
-datastats = []
-
-for f in files:
- with open(f, 'r') as content_file:
- content = content_file.read()
-
- # title
- title = content.split('\n')[0].replace('#', '')
-
- # count papers
- papers = set(
- (papertype, titlecase.titlecase(paper.lower().strip()))
- for (papertype, paper) in re.findall(
- r'\s*\n.*?\btitle\s*=\s*{(.*?)}',
- content, re.DOTALL))
- # paper links
- revcontent = '\n'.join(list(reversed(content.splitlines())))
- paperlinks = {}
- for _, p in papers:
- print(p)
- q = p.replace('\\', '\\\\').replace('?', '\\?')
- paperlinks[p] = ', '.join(
- (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})'
- for p in re.findall(
- rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
- revcontent, re.DOTALL | re.IGNORECASE)))
- print(' ', paperlinks[p])
- paperlist = '\n'.join(
- sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
-
- statsmsg = f"""
-## [{title}]({f})
-
-* Number of papers: {len(papers)}
-{paperlist}
-
- """
-
- datastats.append((papers, configs, ckpts, statsmsg))
-
-alldatapapers = func.reduce(lambda a, b: a.union(b),
- [p for p, _, _, _ in datastats])
+""" # noqa: E501
-# Summarize
+METRIC_ALIAS = {
+ 'Top 1 Accuracy': 'Top-1 (%)',
+ 'Top 5 Accuracy': 'Top-5 (%)',
+}
-msglist = '\n'.join(x for _, _, _, x in stats)
-datamsglist = '\n'.join(x for _, _, _, x in datastats)
-papertypes, papercounts = np.unique([t for t, _ in alldatapapers],
- return_counts=True)
-countstr = '\n'.join(
- [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
+TASK_MAP = dict(
+ detection='Spatio Temporal Action Detection Models',
+ localization='Action Localization Models',
+ recognition='Action Recognition Models',
+ skeleton='Skeleton-based Action Recognition Models',
+ retrieval='Video Retrieval Models',
+ recognition_audio='Audio-based Action Recognition Models')
-datasetzoo = f"""
-# Overview
+model_index = load(str(MMACT_ROOT / 'model-index.yml'))
-* Number of papers: {len(alldatapapers)}
-{countstr}
-For supported action algorithms, see [modelzoo overview](modelzoo.md).
+def build_collections(model_index):
+ # add models for collections
+ col_by_name = {}
+ for col in model_index.collections:
+ setattr(col, 'models', [])
+ col_by_name[col.name] = col
-{datamsglist}
-"""
+ for model in model_index.models:
+ col = col_by_name[model.in_collection]
+ col.models.append(model)
+ setattr(model, 'collection', col)
+ if model.results is None:
+ setattr(model, 'tasks', [])
+ else:
+ setattr(model, 'tasks', [result.task for result in model.results])
-with open('datasetzoo_overview.md', 'w') as f:
- f.write(datasetzoo)
+
+build_collections(model_index)
+
+# save a map from model name to title in README
+model2title = dict()
+
+
+def count_papers(collections):
+ total_num_ckpts = 0
+ type_count = defaultdict(int)
+ paper_msgs = []
+
+ for collection in collections:
+ with open(MMACT_ROOT / collection.readme) as f:
+ readme = f.read()
+
+ ckpts = set(x.lower().strip()
+ for x in re.findall(r'\[ckpt.*\]\((https?.*)\)', readme))
+ total_num_ckpts += len(ckpts)
+ title = collection.paper['Title']
+ papertype = collection.data.get('type', 'Algorithm')
+ type_count[papertype] += 1
+
+ readme_title = re.search(r'^#\s+.+', readme)
+
+ readme = Path(collection.filepath).parents[1].with_suffix('.md').name
+ model = Path(collection.filepath).parent.name
+ model2title[model] = readme_title.group()[2:].replace(' ', '-')
+ paper_msgs.append(f'\t- [{papertype}] [{title}]({PAPERS_ROOT / readme}'
+ f'#{model2title[model]}) ({len(ckpts)} ckpts)')
+
+ type_msg = '\n'.join(
+ [f'\t- {type_}: {count}' for type_, count in type_count.items()])
+ paper_msg = '\n'.join(paper_msgs)
+
+ modelzoo = MODELZOO_TEMPLATE.format(
+ num_papers=len(collections),
+ num_ckpts=total_num_ckpts,
+ type_msg=type_msg,
+ paper_msg=paper_msg,
+ )
+
+ with open('modelzoo_statistics.md', 'w') as f:
+ f.write(modelzoo)
+
+
+count_papers(model_index.collections)
+
+
+def generate_paper_page(collection):
+
+ # Write a copy of README
+ with open(MMACT_ROOT / collection.readme) as f:
+ content = f.read()
+ readme_path = Path(collection.filepath)
+ copy = PAPERS_ROOT / readme_path.parents[1].with_suffix('.md').name
+ if not copy.exists():
+ with open(copy, 'w') as copy_file:
+ task = readme_path.parents[1].name
+ head_content = f'# {TASK_MAP[task]}\n'
+ copy_file.write(head_content)
+
+ def lower_heading(match):
+ return '#' + match.group()
+
+ content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
+ Path(collection.readme))
+ content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content,
+ Path(collection.readme))
+
+ content = re.sub(r'^#+\s+.+', lower_heading, content, flags=re.M)
+
+ with open(copy, 'a') as copy_file:
+ copy_file.write(content)
+
+
+if PAPERS_ROOT.exists():
+ shutil.rmtree(PAPERS_ROOT)
+PAPERS_ROOT.mkdir(exist_ok=True)
+for collection in model_index.collections:
+ generate_paper_page(collection)
+
+
+def scatter_results(models):
+ model_result_pairs = []
+ for model in models:
+ if model.results is None:
+ result = Result(task=None, dataset=None, metrics={})
+ model_result_pairs.append((model, result))
+ else:
+ for result in model.results:
+ model_result_pairs.append((model, result))
+ return model_result_pairs
+
+
+def generate_summary_table(task, model_result_pairs, title=None):
+ metrics = set()
+ for model, result in model_result_pairs:
+ if result.task == task:
+ metrics = metrics.union(result.metrics.keys())
+ metrics = sorted(list(metrics))
+
+ rows = []
+
+ def convert2float(number):
+ units = {'M': 1e6, 'G': 1e9, 'T': 1e12}
+ if isinstance(number, str):
+ num = float(number.rstrip('MGT'))
+ number = num * units[number[-1]]
+ return number
+
+ for model, result in model_result_pairs:
+ if result.task != task:
+ continue
+ name = model.name
+ if model.metadata.parameters is not None:
+ params = convert2float(model.metadata.parameters)
+ params = f'{params / 1e6:.2f}' # Params
+ else:
+ params = None
+ if model.metadata.flops is not None:
+ flops = convert2float(model.metadata.flops)
+ flops = f'{flops / 1e9:.2f}' # Flops
+ else:
+ flops = None
+
+ readme = Path(
+ model.collection.filepath).parents[1].with_suffix('.md').name
+ model = Path(model.collection.filepath).parent.name
+ page = f'[link]({PAPERS_ROOT / readme}#{model2title[model]})'
+ model_metrics = []
+ for metric in metrics:
+ model_metrics.append(str(result.metrics.get(metric, '')))
+
+ rows.append([name, params, flops, *model_metrics, page])
+
+ with open('modelzoo_statistics.md', 'a') as f:
+ if title is not None:
+ f.write(f'\n{title}')
+ f.write("""\n```{table}\n:class: model-summary\n""")
+ header = [
+ 'Model',
+ 'Params (M)',
+ 'Flops (G)',
+ *[METRIC_ALIAS.get(metric, metric) for metric in metrics],
+ 'Readme',
+ ]
+ table_cfg = dict(
+ tablefmt='pipe',
+ floatfmt='.2f',
+ numalign='right',
+ stralign='center')
+ f.write(tabulate(rows, header, **table_cfg))
+ f.write('\n```\n')
+
+
+def generate_dataset_wise_table(task, model_result_pairs, title=None):
+ dataset_rows = defaultdict(list)
+ for model, result in model_result_pairs:
+ if result.task == task:
+ dataset_rows[result.dataset].append((model, result))
+
+ if title is not None:
+ with open('modelzoo_statistics.md', 'a') as f:
+ f.write(f'\n{title}')
+ for dataset, pairs in dataset_rows.items():
+ generate_summary_table(task, pairs, title=f'### {dataset}')
+
+
+model_result_pairs = scatter_results(model_index.models)
+
+# Generate Action Recognition Summary
+generate_dataset_wise_table(
+ task='Action Recognition',
+ model_result_pairs=model_result_pairs,
+ title='## Action Recognition',
+)
+
+# Generate Action Detection Summary
+generate_dataset_wise_table(
+ task='Action Detection',
+ model_result_pairs=model_result_pairs,
+ title='## Action Detection',
+)
+
+# Generate Skeleton-based Action Recognition Summary
+generate_dataset_wise_table(
+ task='Skeleton-based Action Recognition',
+ model_result_pairs=model_result_pairs,
+ title='## Skeleton-based Action Recognition',
+)
+
+# Generate Video Retrieval Summary
+generate_dataset_wise_table(
+ task='Video Retrieval',
+ model_result_pairs=model_result_pairs,
+ title='## Video Retrieval',
+)
+
+# Generate Temporal Action Localization Summary
+generate_dataset_wise_table(
+ task='Temporal Action Localization',
+ model_result_pairs=model_result_pairs,
+ title='## Temporal Action Localization',
+)
diff --git a/docs/en/supported_datasets.md b/docs/en/supported_datasets.md
deleted file mode 100644
index 42911fc8ff..0000000000
--- a/docs/en/supported_datasets.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Supported Datasets
-
-- Action Recognition
-
- - [UCF101](/tools/data/ucf101/README.md) \[ [Homepage](https://www.crcv.ucf.edu/research/data-sets/ucf101/) \].
- - [HMDB51](/tools/data/hmdb51/README.md) \[ [Homepage](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) \].
- - [Kinetics-\[400/600/700\]](/tools/data/kinetics/README.md) \[ [Homepage](https://deepmind.com/research/open-source/kinetics) \]
- - [Something-Something V1](/tools/data/sthv1/README.md) \[ [Homepage](https://20bn.com/datasets/something-something/v1) \]
- - [Something-Something V2](/tools/data/sthv2/README.md) \[ [Homepage](https://20bn.com/datasets/something-something) \]
- - [Moments in Time](/tools/data/mit/README.md) \[ [Homepage](http://moments.csail.mit.edu/) \]
- - [Multi-Moments in Time](/tools/data/mmit/README.md) \[ [Homepage](http://moments.csail.mit.edu/challenge_iccv_2019.html) \]
- - [HVU](/tools/data/hvu/README.md) \[ [Homepage](https://github.com/holistic-video-understanding/HVU-Dataset) \]
- - [Jester](/tools/data/jester/README.md) \[ [Homepage](https://developer.qualcomm.com/software/ai-datasets/jester) \]
- - [GYM](/tools/data/gym/README.md) \[ [Homepage](https://sdolivia.github.io/FineGym/) \]
- - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \]
- - [Diving48](/tools/data/diving48/README.md) \[ [Homepage](http://www.svcl.ucsd.edu/projects/resound/dataset.html) \]
- - [OmniSource](/tools/data/omnisource/README.md) \[ [Homepage](https://kennymckormick.github.io/omnisource/) \]
-
-- Temporal Action Detection
-
- - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \]
- - [THUMOS14](/tools/data/thumos14/README.md) \[ [Homepage](https://www.crcv.ucf.edu/THUMOS14/download.html) \]
-
-- Spatial Temporal Action Detection
-
- - [AVA](/tools/data/ava/README.md) \[ [Homepage](https://research.google.com/ava/index.html) \]
- - [UCF101-24](/tools/data/ucf101_24/README.md) \[ [Homepage](http://www.thumos.info/download.html) \]
- - [JHMDB](/tools/data/jhmdb/README.md) \[ [Homepage](http://jhmdb.is.tue.mpg.de/) \]
-
-- Skeleton-based Action Recognition
-
- - [PoseC3D Skeleton Dataset](/tools/data/skeleton/README.md) \[ [Homepage](https://kennymckormick.github.io/posec3d/) \]
-
-The supported datasets are listed above.
-We provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`.
-Below is the detailed tutorials of data deployment for each dataset.
diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md
index 80cf0dc571..4bade2237f 100644
--- a/docs/en/switch_language.md
+++ b/docs/en/switch_language.md
@@ -1,3 +1,3 @@
## English
-## 简体中文
+## 简体中文
diff --git a/docs/en/useful_tools.md b/docs/en/useful_tools.md
index 943303b82c..60be90fc8f 100644
--- a/docs/en/useful_tools.md
+++ b/docs/en/useful_tools.md
@@ -36,7 +36,7 @@ python tools/deployment/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
E.g.,
-```shelltsn_r50
+```shell
python tools/deployment/publish_model.py work_dirs/tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb/latest.pth tsn_r50_1x1x3_100e_kinetics400_rgb.pth
```
@@ -85,7 +85,7 @@ Take joint-bone fusion as an example, which is a general practice in the task of
python tools/analysis_tools/report_accuracy.py --preds demo/fuse/joint.pkl demo/fuse/bone.pkl --coefficients 1.0 1.0
```
-```{note}
+```
Mean Class Accuracy: 0.9180
Top 1 Accuracy: 0.9333
Top 5 Accuracy: 0.9833
diff --git a/docs/en/user_guides/finetune.md b/docs/en/user_guides/finetune.md
index a795a0b875..a41bcf3a49 100644
--- a/docs/en/user_guides/finetune.md
+++ b/docs/en/user_guides/finetune.md
@@ -1,9 +1,9 @@
-# Tutorial 2: Finetuning Models
+# Finetuning Models
This tutorial provides instructions for users to use the pre-trained models
to finetune them on other datasets, so that better performance can be achieved.
-- [Tutorial 2: Finetuning Models](#tutorial-2-finetuning-models)
+- [Finetuning Models](#finetuning-models)
- [Outline](#outline)
- [Choose Template Config](#choose-template-config)
- [Modify Head](#modify-head)
@@ -325,6 +325,7 @@ Example: train the TSN model on Kinetics-400 dataset in a deterministic option.
```shell
python tools/train.py configs/recognition/tsn/tsn_ucf101.py \
+ --seed=0 --deterministic
```
For more details, you can refer to the **Training** part in the [Training and Test Tutorial](train_test.md).
diff --git a/docs/en/user_guides/prepare_dataset.md b/docs/en/user_guides/prepare_dataset.md
index cd4225aaa0..c13d448106 100644
--- a/docs/en/user_guides/prepare_dataset.md
+++ b/docs/en/user_guides/prepare_dataset.md
@@ -24,7 +24,7 @@ To make video decoding faster, we support several efficient video loading librar
## Use built-in datasets
-MMAction2 already supports many datasets, we provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`, please refer to [supported datasets](../supported_datasets.md) for details to prepare specific datasets.
+MMAction2 already supports many datasets. We provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`; please refer to [supported datasets](../datasetzoo_statistics.md) for details on preparing specific datasets.
## Use a custom dataset
@@ -112,7 +112,7 @@ The task recognizes the action class based on the skeleton sequence (time sequen
- Build from RGB video data
- You need to extract keypoints data from video and convert it to a supported format, we provide a [tutorial](/configs/skeleton/posec3d/custom_dataset_training.md) with detailed instructions.
+ You need to extract keypoints data from video and convert it to a supported format, we provide a [tutorial](https://github.com/open-mmlab/mmaction2/tree/main/configs/skeleton/posec3d/custom_dataset_training.md) with detailed instructions.
- Build from existing keypoint data
diff --git a/docs/en/utils.py b/docs/en/utils.py
new file mode 100644
index 0000000000..a3ad213730
--- /dev/null
+++ b/docs/en/utils.py
@@ -0,0 +1,28 @@
+import re
+from pathlib import Path
+
+
+def replace_link(pattern, template, content, file_path):
+ MMACT_ROOT = Path(__file__).absolute().parents[2]
+ GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/'
+
+ def replace_core(matchobj):
+ name = matchobj.group(1)
+ link = matchobj.group(2)
+ if link.startswith('http') or link.startswith('#'):
+ return template.format(name, link)
+ # For link relative to project folder, such as '/configs/*/*.py'
+ elif Path(link).is_absolute():
+ link = link.lstrip('/')
+ folder = MMACT_ROOT
+ # For link relative to current file, such as './config/*.py'
+ else:
+ folder = file_path.parent
+ file_link = link.split('#')[0]
+ assert (folder / file_link).exists(), \
+ f'Link not found:\n{file_path}: {folder / link}'
+ rel_link = (folder / link).resolve().relative_to(MMACT_ROOT)
+ link = GITHUB_PREFIX + str(rel_link)
+ return template.format(name, link)
+
+ return re.sub(pattern, replace_core, content)
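For context, here is how the generator scripts above typically call this helper; a sketch only, with a made-up input string, assuming it runs from `docs/en/` inside an mmaction2 checkout so that the asserted path exists:

```python
# Illustrative usage of replace_link (not part of this patch): rewrite a
# repo-absolute markdown link into a GitHub URL.
from pathlib import Path

from utils import replace_link

content = 'See the [runtime config](/configs/_base_/default_runtime.py).'
out = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
                   Path('get_started/quick_run.md'))
# out == 'See the [runtime config](https://github.com/open-mmlab/'
#        'mmaction2/blob/main/configs/_base_/default_runtime.py).'
```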
diff --git a/docs/zh_cn/_static/css/readthedocs.css b/docs/zh_cn/_static/css/readthedocs.css
index 07611c2b06..b2a56e8de4 100644
--- a/docs/zh_cn/_static/css/readthedocs.css
+++ b/docs/zh_cn/_static/css/readthedocs.css
@@ -4,3 +4,59 @@
height: 40px;
width: 130px;
}
+
+@media screen and (min-width: 1100px) {
+ .header-logo {
+ top: -12px;
+ }
+ }
+
+ pre {
+ white-space: pre;
+ }
+
+ @media screen and (min-width: 2000px) {
+ .pytorch-content-left {
+ width: 1200px;
+ margin-left: 30px;
+ }
+ article.pytorch-article {
+ max-width: 1200px;
+ }
+ .pytorch-breadcrumbs-wrapper {
+ width: 1200px;
+ }
+ .pytorch-right-menu.scrolling-fixed {
+ position: fixed;
+ top: 45px;
+ left: 1580px;
+ }
+ }
+
+
+ article.pytorch-article section code {
+ padding: .2em .4em;
+ background-color: #f3f4f7;
+ border-radius: 5px;
+ }
+
+ /* Disable the change in tables */
+ article.pytorch-article section table code {
+ padding: unset;
+ background-color: unset;
+ border-radius: unset;
+ }
+
+ table.autosummary td {
+ width: 50%
+ }
+
+ img.align-center {
+ display: block;
+ margin-left: auto;
+ margin-right: auto;
+ }
+
+ article.pytorch-article p.rubric {
+ font-weight: bold;
+ }
diff --git a/docs/zh_cn/_static/js/custom.js b/docs/zh_cn/_static/js/custom.js
new file mode 100644
index 0000000000..93edb6009e
--- /dev/null
+++ b/docs/zh_cn/_static/js/custom.js
@@ -0,0 +1,20 @@
+var collapsedSections = ['数据集支持'];
+
+$(document).ready(function () {
+ $('.model-summary').DataTable({
+ "stateSave": false,
+ "lengthChange": false,
+ "pageLength": 20,
+ "order": [],
+ "language": {
+ "info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ )",
+ "infoFiltered": "(筛选自 _MAX_ 条目)",
+ "search": "搜索:",
+ "zeroRecords": "没有找到任何条目",
+ "paginate": {
+ "next": "下一页",
+ "previous": "上一页"
+ },
+ }
+ });
+});
diff --git a/docs/zh_cn/_templates/404.html b/docs/zh_cn/_templates/404.html
new file mode 100644
index 0000000000..1e41a6217b
--- /dev/null
+++ b/docs/zh_cn/_templates/404.html
@@ -0,0 +1,16 @@
+{% extends "layout.html" %}
+
+{% block body %}
+
+未找到页面
+
+ 未找到你要打开的页面。
+
+
+ 如果你是从旧版本文档跳转至此,可能是对应的页面被移动了。请从左侧的目录中寻找新版本文档,或者跳转至首页。
+
+
+ 如果你找不到希望打开的文档,欢迎在 Issue 中告诉我们!
+
+
+{% endblock %}
diff --git a/docs/zh_cn/advanced_guides/customize_dataset.md b/docs/zh_cn/advanced_guides/customize_dataset.md
new file mode 100644
index 0000000000..7749a484a3
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/customize_dataset.md
@@ -0,0 +1,126 @@
+# 自定义数据集
+
+在本教程中,我们将介绍如何通过在线转换来自定义你的数据集。
+
+- [自定义数据集](#自定义数据集)
+ - [MMAction2 数据集概述](#mmaction2-数据集概述)
+ - [定制新的数据集](#定制新的数据集)
+ - [为 PoseDataset 自定义关键点格式](#为-posedataset-自定义关键点格式)
+
+## MMAction2 数据集概述
+
+MMAction2 提供了任务特定的 `Dataset` 类,例如用于动作识别的 `VideoDataset`/`RawframeDataset`,用于时空动作检测的 `AVADataset`,用于基于骨骼的动作识别的`PoseDataset`。这些任务特定的数据集只需要实现 `load_data_list(self)` 来从注释文件生成数据列表。剩下的函数由超类(即 `BaseActionDataset` 和 `BaseDataset`)自动处理。下表显示了模块的继承关系和主要方法。
+
+| 类名 | 类方法 |
+| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `MMAction2::VideoDataset` | `load_data_list(self)` <br> 从注释文件中构建数据列表。 |
+| `MMAction2::BaseActionDataset` | `get_data_info(self, idx)` <br> 给定 `idx`,从数据列表中返回相应的数据样本。 |
+| `MMEngine::BaseDataset` | `__getitem__(self, idx)` <br> 给定 `idx`,调用 `get_data_info` 获取数据样本,然后调用 `pipeline` 在 `train_pipeline` 或 `val_pipeline` 中执行数据变换和增强。 |
+
+## 定制新的数据集
+
+大多数情况下,把你的数据集离线转换成指定格式是首选方法,但 MMAction2 提供了一个方便的过程来创建一个定制的 `Dataset` 类。如前所述,任务特定的数据集只需要实现 `load_data_list(self)` 来从注释文件生成数据列表。请注意,`data_list` 中的元素是包含后续流程中必要字段的 `dict`。
+
+以 `VideoDataset` 为例,`train_pipeline`/`val_pipeline` 在 `DecordInit` 中需要 `'filename'`,在 `PackActionInputs` 中需要 `'label'`。因此,`data_list` 中的数据样本必须包含2个字段:`'filename'`和`'label'`。
+请参考[定制数据流水线](customize_pipeline.md)以获取有关 `pipeline` 的更多详细信息。
+
+```
+data_list.append(dict(filename=filename, label=label))
+```
+
+`AVADataset` 会更加复杂,`data_list` 中的数据样本包含有关视频数据的几个字段。此外,它重写了 `get_data_info(self, idx)` 以转换在时空动作检测数据流水线中需要用的字段。
+
+```python
+
+class AVADataset(BaseActionDataset):
+ ...
+
+ def load_data_list(self) -> List[dict]:
+ ...
+ video_info = dict(
+ frame_dir=frame_dir,
+ video_id=video_id,
+ timestamp=int(timestamp),
+ img_key=img_key,
+ shot_info=shot_info,
+ fps=self._FPS,
+ ann=ann)
+ data_list.append(video_info)
+ return data_list
+
+ def get_data_info(self, idx: int) -> dict:
+ ...
+ ann = data_info.pop('ann')
+ data_info['gt_bboxes'] = ann['gt_bboxes']
+ data_info['gt_labels'] = ann['gt_labels']
+ data_info['entity_ids'] = ann['entity_ids']
+ return data_info
+```
+
+## 为 PoseDataset 自定义关键点格式
+
+MMAction2 目前支持三种关键点格式:`coco`,`nturgb+d` 和 `openpose`。如果你使用其中一种格式,你可以简单地在以下模块中指定相应的格式:
+
+对于图卷积网络,如 AAGCN,STGCN,...
+
+- `pipeline`:在 `JointToBone` 中的参数 `dataset`。
+- `backbone`:在图卷积网络中的参数 `graph_cfg`。
+
+对于 PoseC3D:
+
+- `pipeline`:在 `Flip` 中,根据关键点的对称关系指定 `left_kp` 和 `right_kp`。
+- `pipeline`:在 `GeneratePoseTarget` 中,如果 `with_limb` 为 `True`,指定`skeletons`,`left_limb`,`right_limb`,如果 `with_kp` 为 `True`,指定`left_kp` 和 `right_kp`。
+
+如果使用自定义关键点格式,需要在 `backbone` 和 `pipeline` 中都包含一个新的图布局。这个布局将定义关键点及其连接关系。
+
+以 `coco` 数据集为例,我们在 `Graph` 中定义了一个名为 `coco` 的布局。这个布局的 `inward` 连接包括所有节点连接,每个**向心**连接由一个节点元组组成。`coco`的额外设置包括将节点数指定为 `17`,将 `node 0` 设为中心节点。
+
+```python
+
+self.num_node = 17
+self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5),
+ (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0),
+ (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)]
+self.center = 0
+```
+
+同样,我们在 `JointToBone` 中定义了 `pairs`,添加了一个 bone `(0, 0)` 以使 bone 的数量对齐到 joint。coco数据集的 `pairs` 如下所示,`JointToBone` 中的 `pairs` 的顺序无关紧要。
+
+```python
+
+self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2),
+ (5, 0), (6, 0), (7, 5), (8, 6), (9, 7),
+ (10, 8), (11, 0), (12, 0), (13, 11), (14, 12),
+ (15, 13), (16, 14))
+```
+
+要使用你的自定义关键点格式,只需定义上述设置为你的图结构,并在你的配置文件中指定它们,如下所示。在这个例子中,我们将使用 `STGCN`,其中 `n` 表示类别的数量,`custom_dataset` 在 `Graph` 和 `JointToBone` 中定义。
+
+```python
+model = dict(
+ type='RecognizerGCN',
+ backbone=dict(
+ type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')),
+ cls_head=dict(type='GCNHead', num_classes=n, in_channels=256))
+
+train_pipeline = [
+ ...
+ dict(type='GenSkeFeat', dataset='custom_dataset'),
+ ...]
+
+val_pipeline = [
+ ...
+ dict(type='GenSkeFeat', dataset='custom_dataset'),
+ ...]
+
+test_pipeline = [
+ ...
+ dict(type='GenSkeFeat', dataset='custom_dataset'),
+ ...]
+
+```
+
+只需简单地指定自定义布局,你就可以使用你自己的关键点格式进行训练和测试了。通过这种方式,MMAction2 为用户提供了很大的灵活性,允许用户自定义他们的数据集和关键点格式,以满足他们特定的需求。
+
+以上就是关于如何自定义你的数据集的一些方法。希望这个教程能帮助你理解MMAction2的数据集结构,并教给你如何根据自己的需求创建新的数据集。虽然这可能需要一些编程知识,但是 MMAction2 试图使这个过程尽可能简单。通过了解这些基本概念,你将能够更好地控制你的数据,从而改进你的模型性能。
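下面给出一个与上文 `load_data_list(self)` 说明相对应的最小自定义数据集草图(仅作示意,非本补丁内容;假设标注文件每行为 `<视频相对路径> <整数标签>`):

```python
# 仅作示意(非本补丁内容):一个最小的自定义视频数据集草图。
# 假设标注文件每行为 "<视频相对路径> <整数标签>"。
from mmaction.datasets import BaseActionDataset
from mmaction.registry import DATASETS


@DATASETS.register_module()
class MyVideoDataset(BaseActionDataset):

    def load_data_list(self):
        data_list = []
        with open(self.ann_file) as f:
            for line in f:
                filename, label = line.strip().split()
                # 后续 pipeline(如 DecordInit、PackActionInputs)
                # 需要 'filename' 与 'label' 两个字段
                data_list.append(dict(filename=filename, label=int(label)))
        return data_list
```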
diff --git a/docs/zh_cn/advanced_guides/customize_logging.md b/docs/zh_cn/advanced_guides/customize_logging.md
new file mode 100644
index 0000000000..badb315cd1
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/customize_logging.md
@@ -0,0 +1,163 @@
+# 自定义日志
+
+MMAction2 在运行过程中会产生大量的日志,如损失、迭代时间、学习率等。在这一部分,我们将向你介绍如何输出自定义日志。有关日志系统的更多详细信息,请参考 [MMEngine 教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/logging.html)。
+
+- [自定义日志](#自定义日志)
+ - [灵活的日志系统](#灵活的日志系统)
+ - [定制日志](#定制日志)
+ - [导出调试日志](#导出调试日志)
+
+## 灵活的日志系统
+
+默认情况下,MMAction2 的日志系统由 [default_runtime](/configs/_base_/default_runtime.py) 中的 `LogProcessor` 配置:
+
+```python
+log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)
+```
+
+默认情况下,`LogProcessor` 捕获 `model.forward` 返回的所有以 `loss` 开头的字段。例如,在以下模型中,`loss1` 和 `loss2` 将在没有任何额外配置的情况下自动记录到日志。
+
+```python
+from mmengine.model import BaseModel
+
+class ToyModel(BaseModel):
+ def __init__(self) -> None:
+ super().__init__()
+ self.linear = nn.Linear(1, 1)
+
+ def forward(self, img, label, mode):
+ feat = self.linear(img)
+ loss1 = (feat - label).pow(2)
+ loss2 = (feat - label).abs()
+ return dict(loss1=loss1, loss2=loss2)
+```
+
+输出日志遵循以下格式:
+
+```
+08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388
+08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290
+```
+
+`LogProcessor` 将按以下格式输出日志:
+
+- 日志的前缀:
+  - epoch 模式(`by_epoch=True`):`Epoch(train) [{current_epoch}][{current_iteration}/{dataloader_length}]`
+ - iteration 模式(`by_epoch=False`):`Iter(train) [{current_iteration}/{max_iteration}]`
+- 学习率 (`lr`):最后一次迭代的学习率。
+- 时间:
+ - `time`:过去 `window_size` 次迭代的推理平均时间。
+ - `data_time`:过去 `window_size` 次迭代的数据加载平均时间。
+ - `eta`:完成训练的预计到达时间。
+- 损失:过去 `window_size` 次迭代中模型输出的平均损失。
+
+```{warning}
+默认情况下,log_processor 输出基于 epoch 的日志(`by_epoch=True`)。要得到与 `train_cfg` 匹配的预期日志,我们应在 `train_cfg` 和 `log_processor` 中设置相同的 `by_epoch` 值。
+```
+
+根据以上规则,代码片段将每20次迭代计算 loss1 和 loss2 的平均值。更多类型的统计方法,请参考 [mmengine.runner.LogProcessor](mmengine.runner.LogProcessor)。
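+
+若训练循环是基于迭代的,两者都应使用 `by_epoch=False`。下面是一个最小示意配置(假设使用 MMEngine 的 `IterBasedTrainLoop`,其中的迭代次数等数值仅作示例):
+
+```python
+# 基于迭代的训练循环,与 log_processor 的 by_epoch 保持一致(示意配置)
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=10000, val_interval=1000)
+log_processor = dict(type='LogProcessor', window_size=20, by_epoch=False)
+```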
+
+## 定制日志
+
+日志系统不仅可以记录 `loss`,`lr` 等,还可以收集和输出自定义日志。例如,如果我们想要统计中间损失:
+
+`ToyModel` 在 forward 中计算 `loss_tmp`,但不将其保存到返回字典中。
+
+```python
+from mmengine.logging import MessageHub
+
+class ToyModel(BaseModel):
+
+ def __init__(self) -> None:
+ super().__init__()
+ self.linear = nn.Linear(1, 1)
+
+ def forward(self, img, label, mode):
+ feat = self.linear(img)
+ loss_tmp = (feat - label).abs()
+ loss = loss_tmp.pow(2)
+
+ message_hub = MessageHub.get_current_instance()
+ # 在消息中心更新中间的 `loss_tmp`
+ message_hub.update_scalar('train/loss_tmp', loss_tmp.sum())
+ return dict(loss=loss)
+```
+
+将 `loss_tmp` 添加到配置中:
+
+```python
+log_processor = dict(
+ type='LogProcessor',
+ window_size=20,
+ by_epoch=True,
+ custom_cfg=[
+ # 使用平均值统计 loss_tmp
+ dict(
+ data_src='loss_tmp',
+ window_size=20,
+ method_name='mean')
+ ])
+```
+
+`loss_tmp` 将被添加到输出日志中:
+
+```
+08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000
+08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000
+```
+
+## 导出调试日志
+
+要将调试日志导出到 `work_dir`,你可以在配置文件中设置日志级别如下:
+
+```
+log_level='DEBUG'
+```
+
+```
+08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine"
+08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its implementation can be found in mmengine.visualization.vis_backend
+08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine"
+08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook
+08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine"
+...
+```
+
+此外,如果你正在使用共享存储训练你的模型,那么在 `debug` 模式下,不同排名的日志将被保存。日志的层级结构如下:
+
+```text
+./tmp
+├── tmp.log
+├── tmp_rank1.log
+├── tmp_rank2.log
+├── tmp_rank3.log
+├── tmp_rank4.log
+├── tmp_rank5.log
+├── tmp_rank6.log
+└── tmp_rank7.log
+...
+└── tmp_rank63.log
+```
+
+在具有独立存储的多台机器上的日志:
+
+```text
+# 设备:0:
+work_dir/
+└── exp_name_logs
+ ├── exp_name.log
+ ├── exp_name_rank1.log
+ ├── exp_name_rank2.log
+ ├── exp_name_rank3.log
+ ...
+ └── exp_name_rank7.log
+
+# 设备:7:
+work_dir/
+└── exp_name_logs
+ ├── exp_name_rank56.log
+ ├── exp_name_rank57.log
+ ├── exp_name_rank58.log
+ ...
+ └── exp_name_rank63.log
+```
diff --git a/docs/zh_cn/advanced_guides/customize_models.md b/docs/zh_cn/advanced_guides/customize_models.md
new file mode 100644
index 0000000000..c64a6bb3d2
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/customize_models.md
@@ -0,0 +1,3 @@
+# 自定义模型
+
+内容建设中...
diff --git a/docs/zh_cn/advanced_guides/customize_optimizer.md b/docs/zh_cn/advanced_guides/customize_optimizer.md
new file mode 100644
index 0000000000..5e4279d445
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/customize_optimizer.md
@@ -0,0 +1,332 @@
+# 自定义优化器
+
+在本教程中,我们将介绍一些构建优化器和学习率策略的方法,以用于你的任务。
+
+- [自定义优化器](#自定义优化器)
+ - [使用 optim_wrapper 构建优化器](#使用-optim_wrapper-构建优化器)
+ - [使用 PyTorch 支持的优化器](#使用-pytorch-支持的优化器)
+ - [参数化精细配置](#参数化精细配置)
+ - [梯度裁剪](#梯度裁剪)
+ - [梯度累积](#梯度累积)
+ - [自定义参数策略](#自定义参数策略)
+ - [自定义学习率策略](#自定义学习率策略)
+ - [自定义动量策略](#自定义动量策略)
+ - [添加新的优化器或构造器](#添加新的优化器或构造器)
+ - [添加新的优化器](#添加新的优化器)
+ - [1. 实现一个新的优化器](#1-实现一个新的优化器)
+ - [2. 导入优化器](#2-导入优化器)
+ - [3. 在配置文件中指定优化器](#3-在配置文件中指定优化器)
+ - [添加新的优化器构造器](#添加新的优化器构造器)
+
+## 使用 optim_wrapper 构建优化器
+
+我们使用 `optim_wrapper` 字段来配置优化策略,其中包括选择优化器、参数逐个配置、梯度裁剪和梯度累积。一个简单的示例可以是:
+
+```python
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001)
+)
+```
+
+在上面的示例中,我们构建了一个学习率为 0.0003,权重衰减为 0.0001 的 SGD 优化器。
+
+### 使用 PyTorch 支持的优化器
+
+我们支持 PyTorch 实现的所有优化器。要使用不同的优化器,只需更改配置文件中的 `optimizer` 字段。例如,如果想使用 `torch.optim.Adam`,可以在配置文件中进行如下修改。
+
+```python
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer = dict(
+ type='Adam',
+ lr=0.001,
+ betas=(0.9, 0.999),
+ eps=1e-08,
+ weight_decay=0,
+ amsgrad=False),
+)
+```
+
+首先,我们需要将 `type` 的值更改为 `torch.optim` 支持的期望优化器名称。然后,将该优化器的必要参数添加到 `optimizer` 字段中。上述配置将构建以下优化器:
+
+```python
+torch.optim.Adam(lr=0.001,
+ betas=(0.9, 0.999),
+ eps=1e-08,
+ weight_decay=0,
+ amsgrad=False)
+```
+
+### 参数化精细配置
+
+一些模型可能对优化有特定的参数设置,例如对于 BatchNorm 层不使用权重衰减,或者对不同网络层使用不同的学习率。为了对其进行细致配置,我们可以使用 `optim_wrapper` 中的 `paramwise_cfg` 参数。
+
+- **为不同类型的参数设置不同的超参数倍数。**
+
+ 例如,我们可以在 `paramwise_cfg` 中设置 `norm_decay_mult=0.`,将归一化层的权重衰减设置为零。
+
+ ```python
+ optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4),
+ paramwise_cfg=dict(norm_decay_mult=0.))
+ ```
+
+ 还支持设置其他类型的参数,包括:
+
+ - `lr_mult`:所有参数的学习率乘数。
+ - `decay_mult`:所有参数的权重衰减乘数。
+ - `bias_lr_mult`:偏置项的学习率乘数(不包括归一化层的偏置项和可变形卷积层的偏移量)。默认为 1。
+ - `bias_decay_mult`:偏置项的权重衰减乘数(不包括归一化层的偏置项和可变形卷积层的偏移量)。默认为 1。
+ - `norm_decay_mult`:归一化层权重和偏置项的权重衰减乘数。默认为 1。
+ - `dwconv_decay_mult`:深度卷积层的权重衰减乘数。默认为 1。
+ - `bypass_duplicate`:是否跳过重复的参数。默认为 `False`。
+ - `dcn_offset_lr_mult`:可变形卷积层的学习率乘数。默认为 1。
+
+- **为特定参数设置不同的超参数倍数。**
+
+ MMAction2 可以使用 `paramwise_cfg` 中的 `custom_keys` 来指定不同的参数使用不同的学习率或权重衰减。
+
+ 例如,要将 `backbone.layer0` 的所有学习率和权重衰减设置为 0,而保持 `backbone` 的其余部分与优化器相同,并将 `head` 的学习率设置为 0.001,可以使用以下配置:
+
+ ```python
+ optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+ paramwise_cfg=dict(
+ custom_keys={
+ 'backbone.layer0': dict(lr_mult=0, decay_mult=0),
+ 'backbone': dict(lr_mult=1),
+ 'head': dict(lr_mult=0.1)
+ }))
+ ```
+
+### 梯度裁剪
+
+在训练过程中,损失函数可能接近悬崖区域,导致梯度爆炸。梯度裁剪有助于稳定训练过程。梯度裁剪的更多介绍可以在[这个页面](https://paperswithcode.com/method/gradient-clipping)找到。
+
+目前,我们支持 `optim_wrapper` 中的 `clip_grad` 选项进行梯度裁剪,参考[PyTorch 文档](torch.nn.utils.clip_grad_norm_)。
+
+以下是一个示例:
+
+```python
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+ # norm_type: 使用的 p-范数的类型,这里 norm_type 为 2。
+ clip_grad=dict(max_norm=35, norm_type=2))
+```
+
+### 梯度累积
+
+当计算资源有限时,批量大小只能设置为较小的值,这可能会影响模型的性能。可以使用梯度累积来解决这个问题。我们支持 `optim_wrapper` 中的 `accumulative_counts` 选项进行梯度累积。
+
+以下是一个示例:
+
+```python
+train_dataloader = dict(batch_size=64)
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+ accumulative_counts=4)
+```
+
+表示在训练过程中,每 4 个迭代执行一次反向传播。上述示例等价于:
+
+```python
+train_dataloader = dict(batch_size=256)
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001))
+```
+
+## 自定义参数策略
+
+在训练过程中,优化参数(如学习率、动量等)通常不是固定不变的,而是随着迭代或周期进行调整。PyTorch 支持多种学习率策略,但对于复杂的策略可能无法满足需求。在 MMAction2 中,我们提供 `param_scheduler` 来更好地控制不同参数的调整策略。
+
+### 自定义学习率策略
+
+调整学习率策略被广泛用于提高性能。我们支持大多数 PyTorch 学习率策略,包括 `ExponentialLR`、`LinearLR`、`StepLR`、`MultiStepLR` 等。
+
+所有可用的学习率策略可以在[这里](https://mmaction2.readthedocs.io/en/latest/schedulers.html)找到,学习率策略的名称以 `LR` 结尾。
+
+- **单一学习率策略**
+
+  在大多数情况下,我们只使用一个学习率策略以简化问题。例如,`MultiStepLR` 被用作 ResNet 的默认学习率策略。在这里,`param_scheduler` 是一个字典。
+
+ ```python
+ param_scheduler = dict(
+ type='MultiStepLR',
+ by_epoch=True,
+ milestones=[100, 150],
+ gamma=0.1)
+ ```
+
+ 或者,我们想使用 `CosineAnnealingLR` 策略来衰减学习率:
+
+ ```python
+ param_scheduler = dict(
+ type='CosineAnnealingLR',
+ by_epoch=True,
+ T_max=num_epochs)
+ ```
+
+- **多个学习率策略**
+
+  在某些训练案例中,为了提高准确性,会应用多个学习率策略。例如,在训练早期阶段,训练容易不稳定,预热(warmup)是一种减少不稳定性的技术:学习率会通过预热从一个较小的值逐渐增加到预期值,随后再按其他策略进行衰减。
+
+ 在 MMAction2 中,通过将所需的策略组合成 `param_scheduler` 的列表即可实现预热策略。
+
+ 以下是一些示例:
+
+ 1. 在前 50 个迭代中进行线性预热。
+
+ ```python
+ param_scheduler = [
+ # 线性预热
+ dict(type='LinearLR',
+ start_factor=0.001,
+ by_epoch=False, # 按迭代
+ end=50), # 仅在前 50 个迭代中进行预热
+ # 主要的学习率策略
+ dict(type='MultiStepLR',
+ by_epoch=True,
+ milestones=[8, 11],
+ gamma=0.1)
+ ]
+ ```
+
+ 2. 在前 10 个周期中进行线性预热,并在每个周期内按迭代更新学习率。
+
+ ```python
+ param_scheduler = [
+ # 线性预热 [0, 10) 个周期
+ dict(type='LinearLR',
+ start_factor=0.001,
+ by_epoch=True,
+ end=10,
+ convert_to_iter_based=True, # 按迭代更新学习率
+ ),
+ # 在 10 个周期后使用 CosineAnnealing 策略
+ dict(type='CosineAnnealingLR', by_epoch=True, begin=10)
+ ]
+ ```
+
+ 注意,我们在这里使用 `begin` 和 `end` 参数来指定有效范围,该范围为 \[`begin`, `end`)。范围的单位由 `by_epoch` 参数定义。如果未指定,则 `begin` 为 0,`end` 为最大周期或迭代次数。
+
+ 如果所有策略的范围都不连续,则学习率将在忽略的范围内保持不变,否则所有有效的策略将按特定阶段的顺序执行,这与 PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler) 的行为相同。
+
+### 自定义动量策略
+
+我们支持使用动量策略根据学习率修改优化器的动量,这可以使损失以更快的方式收敛。使用方法与学习率策略相同。
+
+所有可用的学习率策略可以在[这里](https://mmaction2.readthedocs.io/en/latest/schedulers.html)找到,动量策略的名称以 `Momentum` 结尾。
+
+以下是一个示例:
+
+```python
+param_scheduler = [
+ # 学习率策略
+ dict(type='LinearLR', ...),
+ # 动量策略
+ dict(type='LinearMomentum',
+ start_factor=0.001,
+ by_epoch=False,
+ begin=0,
+ end=1000)
+]
+```
+
+## 添加新的优化器或构造器
+
+本部分将修改 MMAction2 源代码或向 MMAction2 框架中添加代码,初学者可以跳过此部分。
+
+### 添加新的优化器
+
+在学术研究和工业实践中,可能需要使用 MMAction2 未实现的优化方法,可以通过以下方法进行添加。
+
+#### 1. 实现一个新的优化器
+
+假设要添加一个名为 `MyOptimizer` 的优化器,它具有参数 `a`、`b` 和 `c`。需要在 `mmaction/engine/optimizers` 下创建一个新文件,并在文件中实现新的优化器,例如在 `mmaction/engine/optimizers/my_optimizer.py` 中:
+
+```python
+from torch.optim import Optimizer
+from mmaction.registry import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+ def __init__(self, a, b, c):
+ ...
+
+ def step(self, closure=None):
+ ...
+```
+
+#### 2. 导入优化器
+
+为了找到上述定义的模块,需要在运行时导入该模块。首先,在 `mmaction/engine/optimizers/__init__.py` 中导入该模块,将其添加到 `mmaction.engine` 包中。
+
+```python
+# In mmaction/engine/optimizers/__init__.py
+...
+from .my_optimizer import MyOptimizer # MyOptimizer 可能是其他类名
+
+__all__ = [..., 'MyOptimizer']
+```
+
+在运行时,我们将自动导入 `mmaction.engine` 包,并同时注册 `MyOptimizer`。
+
+#### 3. 在配置文件中指定优化器
+
+然后,可以在配置文件的 `optim_wrapper.optimizer` 字段中使用 `MyOptimizer`。
+
+```python
+optim_wrapper = dict(
+ optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value))
+```
+
+### 添加新的优化器构造器
+
+一些模型可能对优化有一些特定的参数设置,例如所有 `BatchNorm` 层的不同权重衰减率。
+
+尽管我们已经可以使用[优化器教程](#参数化精细配置)中的 `optim_wrapper.paramwise_cfg` 字段来配置各种特定参数的优化器设置,但可能仍无法满足需求。
+
+当然,你可以修改它。默认情况下,我们使用 [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor) 类来处理优化器的构造。在构造过程中,它根据 `paramwise_cfg` 对不同参数的优化器设置进行细致配置,这也可以作为新优化器构造器的模板。
+
+你可以通过添加新的优化器构造器来覆盖这些行为。
+
+```python
+# In mmaction/engine/optimizers/my_optim_constructor.py
+from mmengine.optim import DefaultOptimWrapperConstructor
+from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class MyOptimWrapperConstructor:
+
+ def __init__(self, optim_wrapper_cfg, paramwise_cfg=None):
+ ...
+
+ def __call__(self, model):
+ ...
+```
+
+然后,按照与[添加新的优化器](#添加新的优化器)中几乎相同的方式导入并使用它。
+
+1. 在 `mmaction/engine/optimizers/__init__.py` 中导入它,将其添加到 `mmaction.engine` 包中。
+
+ ```python
+ # In mmaction/engine/optimizers/__init__.py
+ ...
+ from .my_optim_constructor import MyOptimWrapperConstructor
+
+ __all__ = [..., 'MyOptimWrapperConstructor']
+ ```
+
+2. 在配置文件的 `optim_wrapper.constructor` 字段中使用 `MyOptimWrapperConstructor`。
+
+ ```python
+ optim_wrapper = dict(
+ constructor=dict(type='MyOptimWrapperConstructor'),
+ optimizer=...,
+ paramwise_cfg=...,
+ )
+ ```
diff --git a/docs/zh_cn/advanced_guides/customize_pipeline.md b/docs/zh_cn/advanced_guides/customize_pipeline.md
new file mode 100644
index 0000000000..4327f96d23
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/customize_pipeline.md
@@ -0,0 +1,144 @@
+# 自定义数据流水线
+
+在本教程中,我们将介绍如何为你的任务构建数据流水线(即,数据转换)的一些方法。
+
+- [自定义数据流水线](#自定义数据流水线)
+ - [数据流水线设计](#数据流水线设计)
+  - [修改训练/测试数据流水线](#修改训练测试数据流水线)
+ - [加载](#加载)
+ - [采样帧和其他处理](#采样帧和其他处理)
+ - [格式化](#格式化)
+ - [添加新的数据转换](#添加新的数据转换)
+
+## 数据流水线设计
+
+数据流水线指的是从数据集索引样本时处理数据样本字典的过程,它包括一系列的数据转换。每个数据转换接受一个 `dict` 作为输入,对其进行处理,并产生一个 `dict` 作为输出,供序列中的后续数据转换使用。
+
+以下是一个例子,用于使用 `VideoDataset` 在 Kinetics 上训练 SlowFast 的数据流水线。这个数据流水线首先使用 [`decord`](https://github.com/dmlc/decord) 读取原始视频并随机采样一个视频剪辑,该剪辑包含 `32` 帧,帧间隔为 `2`。然后,它对所有帧应用随机大小调整的裁剪和随机水平翻转,然后将数据形状格式化为 `NCTHW`,在这个例子中,它是 `(1, 3, 32, 224, 224)`。
+
+```python
+train_pipeline = [
+ dict(type='DecordInit',),
+ dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+```
+
+MMAction2 中所有可用的数据转换的详细列表可以在 [mmaction.datasets.transforms](mmaction.datasets.transforms) 中找到。
+
+## 修改训练/测试数据流水线
+
+MMAction2 的数据流水线非常灵活,因为几乎每一步的数据预处理都可以从配置文件中进行配置。然而,对于一些用户来说,这种多样性可能会让人感到不知所措。
+
+以下是一些用于构建动作识别任务数据流水线的一般实践和指南。
+
+### 加载
+
+在数据流水线的开始,通常是加载视频。然而,如果帧已经被提取出来,你应该使用 `RawFrameDecode` 并修改数据集类型为 `RawframeDataset`。
+
+```python
+train_pipeline = [
+ dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+ dict(type='RawFrameDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+```
+
+如果你需要从具有不同格式(例如,`pkl`,`bin`等)的文件或从特定位置加载数据,你可以创建一个新的加载转换并将其包含在数据流水线的开始。有关更多详细信息,请参阅[添加新的数据转换](#添加新的数据转换)。
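+
+下面给出一个示意性的加载转换(`PklFrameLoad` 为假设的类名,并假设每个样本的 `filename` 指向一个保存了形如 `[T, H, W, C]` 帧数组的 `pkl` 文件),仅用于说明写法;后续的采样、解码等转换需要与这种加载方式相匹配:
+
+```python
+import pickle
+
+from mmcv.transforms import BaseTransform
+
+from mmaction.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class PklFrameLoad(BaseTransform):
+    """从 pkl 文件中加载帧数据(示意实现,非内置转换)。"""
+
+    def transform(self, results):
+        # 假设 pkl 中保存的是形如 [T, H, W, C] 的 numpy 数组
+        with open(results['filename'], 'rb') as f:
+            imgs = pickle.load(f)
+        results['imgs'] = list(imgs)
+        results['total_frames'] = len(imgs)
+        results['img_shape'] = imgs[0].shape[:2]
+        return results
+```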
+
+### 采样帧和其他处理
+
+在训练和测试过程中,我们可能会有从视频中采样帧的不同策略。
+
+例如,当测试 SlowFast 时,我们会均匀地采样多个剪辑,如下所示:
+
+```python
+test_pipeline = [
+ ...
+ dict(
+ type='SampleFrames',
+ clip_len=32,
+ frame_interval=2,
+ num_clips=10,
+ test_mode=True),
+ ...
+]
+```
+
+在上述例子中,每个视频将均匀地采样 10 个视频剪辑,每个剪辑包含 32 帧。与训练期间的随机采样不同,这里通过 `test_mode=True` 实现均匀采样。
+
+另一个例子涉及 `TSN/TSM` 模型,它们从视频中采样多个片段:
+
+```python
+train_pipeline = [
+ ...
+ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+ ...
+]
+```
+
+通常,数据流水线中的数据增强只处理视频级的转换,例如调整大小或裁剪,而不处理像视频标准化或 mixup/cutmix 这样的转换。这是因为我们可以在批量视频数据上进行视频标准化和 mixup/cutmix,以使用 GPU 加速处理。要配置视频标准化和 mixup/cutmix,请使用 [mmaction.models.utils.data_preprocessor](mmaction.models.utils.data_preprocessor)。
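+
+下面是一个示意配置(假设所用版本的 `ActionDataPreprocessor` 支持 `blending` 字段,`num_classes`、`alpha` 等数值仅为示例),展示如何在 `data_preprocessor` 中配置归一化与 mixup:
+
+```python
+model = dict(
+    ...
+    data_preprocessor=dict(
+        type='ActionDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        format_shape='NCTHW',
+        # 在 batch 维度上进行 mixup(num_classes 为假设值)
+        blending=dict(type='MixupBlending', num_classes=400, alpha=0.2)))
+```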
+
+### 格式化
+
+格式化涉及从数据信息字典中收集训练数据,并将其转换为与模型兼容的格式。
+
+在大多数情况下,你可以简单地使用 [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs),它会将 `NumPy Array` 格式的图像转换为 `PyTorch Tensor`,并将真实标签(ground truth)类别信息和其他元信息打包为一个类似字典的对象 [`ActionDataSample`](mmaction.structures.ActionDataSample)。
+
+```python
+train_pipeline = [
+ ...
+ dict(type='PackActionInputs'),
+]
+```
+
+## 添加新的数据转换
+
+1. 要创建一个新的数据转换,请在一个 Python 文件(例如,名为 `my_transforms.py`)中编写一个新的转换类。数据转换类必须继承 [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) 类,并重写 `transform` 方法,该方法接受一个 `dict` 作为输入并返回一个 `dict`。最后,将 `my_transforms.py` 放在 `mmaction/datasets/transforms/` 文件夹中。
+
+ ```python
+ from mmcv.transforms import BaseTransform
+ from mmaction.datasets import TRANSFORMS
+
+ @TRANSFORMS.register_module()
+ class MyTransform(BaseTransform):
+ def __init__(self, msg):
+ self.msg = msg
+
+ def transform(self, results):
+ # 修改数据信息字典 `results`。
+           print(self.msg, 'MMAction2.')
+ return results
+ ```
+
+2. 在 `mmaction/datasets/transforms/__init__.py` 中导入新类。
+
+ ```python
+ ...
+   from .my_transforms import MyTransform
+
+ __all__ = [
+ ..., 'MyTransform'
+ ]
+ ```
+
+3. 在配置文件中使用它。
+
+ ```python
+ train_pipeline = [
+ ...
+ dict(type='MyTransform', msg='Hello!'),
+ ...
+ ]
+ ```
diff --git a/docs/zh_cn/advanced_guides/dataflow.md b/docs/zh_cn/advanced_guides/dataflow.md
new file mode 100644
index 0000000000..29db77f988
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/dataflow.md
@@ -0,0 +1,3 @@
+# MMAction2 的数据流
+
+内容建设中...
diff --git a/docs/zh_cn/advanced_guides/depoly.md b/docs/zh_cn/advanced_guides/depoly.md
new file mode 100644
index 0000000000..58e9f58ea4
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/depoly.md
@@ -0,0 +1,3 @@
+# How to deploy MMAction2 models
+
+coming soon...
diff --git a/docs/zh_cn/api.rst b/docs/zh_cn/api.rst
new file mode 100644
index 0000000000..4431c7734b
--- /dev/null
+++ b/docs/zh_cn/api.rst
@@ -0,0 +1,140 @@
+mmaction.apis
+--------------
+.. automodule:: mmaction.apis
+ :members:
+
+mmaction.datasets
+-----------------
+
+datasets
+^^^^^^^^^^
+.. automodule:: mmaction.datasets
+ :members:
+
+transforms
+^^^^^^^^^^^^
+.. automodule:: mmaction.datasets.transforms
+ :members:
+
+mmaction.engine
+---------------
+
+hooks
+^^^^^^^^^^
+.. automodule:: mmaction.engine.hooks
+ :members:
+
+optimizers
+^^^^^^^^^^^^^^^
+.. automodule:: mmaction.engine.optimizers
+ :members:
+
+runner
+^^^^^^^^^^
+.. automodule:: mmaction.engine.runner
+ :members:
+
+
+mmaction.evaluation
+--------------------
+
+functional
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.evaluation.functional
+ :members:
+
+metrics
+^^^^^^^^^^
+.. automodule:: mmaction.evaluation.metrics
+ :members:
+
+
+mmaction.models
+---------------
+
+backbones
+^^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.backbones
+ :members:
+
+common
+^^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.common
+ :members:
+
+data_preprocessors
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.data_preprocessors
+ :members:
+
+heads
+^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.heads
+ :members:
+
+localizers
+^^^^^^^^^^
+.. automodule:: mmaction.models.localizers
+ :members:
+
+
+losses
+^^^^^^^^^^
+.. automodule:: mmaction.models.losses
+ :members:
+
+necks
+^^^^^^^^^^^^
+.. automodule:: mmaction.models.necks
+ :members:
+
+roi_heads
+^^^^^^^^^^^^^
+.. automodule:: mmaction.models.roi_heads
+ :members:
+
+recognizers
+^^^^^^^^^^^^^
+.. automodule:: mmaction.models.recognizers
+ :members:
+
+task_modules
+^^^^^^^^^^^^^
+.. automodule:: mmaction.models.task_modules
+ :members:
+
+
+utils
+^^^^^^^^^^
+.. automodule:: mmaction.models.utils
+ :members:
+
+
+mmaction.structures
+--------------------
+
+structures
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.structures
+ :members:
+
+bbox
+^^^^^^^^^^
+.. automodule:: mmaction.structures.bbox
+ :members:
+
+
+mmaction.testing
+----------------
+.. automodule:: mmaction.testing
+ :members:
+
+mmaction.visualization
+----------------------
+.. automodule:: mmaction.visualization
+ :members:
+
+mmaction.utils
+--------------
+.. automodule:: mmaction.utils
+ :members:
diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py
index e0e68b6018..67e8821c61 100644
--- a/docs/zh_cn/conf.py
+++ b/docs/zh_cn/conf.py
@@ -17,7 +17,7 @@
import pytorch_sphinx_theme
-sys.path.insert(0, os.path.abspath('..'))
+sys.path.insert(0, os.path.abspath('../..'))
# -- Project information -----------------------------------------------------
@@ -42,8 +42,16 @@ def get_version():
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode',
- 'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser'
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.autosummary',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.napoleon',
+ 'sphinx.ext.viewcode',
+ 'myst_parser',
+ 'sphinx_copybutton',
+ 'sphinx_tabs.tabs',
+ 'notfound.extension',
+ 'sphinxcontrib.jquery',
]
# numpy and torch are required
@@ -90,45 +98,69 @@ def get_version():
{
'name':
'Upstream',
- 'children': [
- {
- 'name': 'MMCV',
- 'url': 'https://github.com/open-mmlab/mmcv',
- 'description': 'Foundational library for computer vision'
- },
- {
- 'name':
- 'MMClassification',
- 'url':
- 'https://github.com/open-mmlab/mmclassification',
- 'description':
- 'Open source image classification toolbox based on PyTorch'
- },
- {
- 'name': 'MMDetection',
- 'url': 'https://github.com/open-mmlab/mmdetection',
- 'description': 'Object detection toolbox and benchmark'
- },
- ]
+ 'children': [{
+ 'name':
+ 'MMCV',
+ 'url':
+ 'https://github.com/open-mmlab/mmcv',
+ 'description':
+ 'Foundational library for computer vision'
+ }, {
+ 'name':
+ 'MMPreTrain',
+ 'url':
+ 'https://github.com/open-mmlab/mmpretrain',
+ 'description':
+ 'Open source pre-training toolbox based on PyTorch'
+ }, {
+ 'name':
+ 'MMDetection',
+ 'url':
+ 'https://github.com/open-mmlab/mmdetection',
+ 'description':
+ 'Object detection toolbox and benchmark'
+ }, {
+ 'name':
+ 'MMPose',
+ 'url':
+ 'https://github.com/open-mmlab/mmpose',
+ 'description':
+ 'Open-source toolbox for pose estimation based on PyTorch.'
+ }]
},
],
# Specify the language of shared menu
'menu_lang':
- 'cn'
+ 'en'
}
-language = 'zh_CN'
+language = 'en'
master_doc = 'index'
html_static_path = ['_static']
-html_css_files = ['css/readthedocs.css']
+html_css_files = [
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
+ 'css/readthedocs.css'
+]
+html_js_files = [
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
+ 'js/custom.js'
+]
myst_enable_extensions = ['colon_fence']
+myst_heading_anchors = 3
+
+# The not found page
+notfound_template = '404.html'
def builder_inited_handler(app):
- subprocess.run(['bash', './merge_docs.sh'])
- subprocess.run(['python', './stat.py'])
+ if subprocess.run(['python', './stat.py']).returncode != 0:
+ raise RuntimeError('Failed to run the script `stat.py`.')
+ if subprocess.run(['python', './project_zoo.py']).returncode != 0:
+ raise RuntimeError('Failed to run the script `project_zoo.py`.')
+ if subprocess.run(['python', './dataset_zoo.py']).returncode != 0:
+ raise RuntimeError('Failed to run the script `dataset_zoo.py`.')
def setup(app):
diff --git a/docs/zh_cn/dataset_zoo.py b/docs/zh_cn/dataset_zoo.py
new file mode 100644
index 0000000000..f9830f36cb
--- /dev/null
+++ b/docs/zh_cn/dataset_zoo.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+import re
+from pathlib import Path
+
+from utils import replace_link
+
+DATASETS_ROOT = Path('dataset_zoo') # Path to save generated paper pages.
+DATASETZOO_TEMPLATE = """\
+# 数据集统计
+
+在本页面中,我们列举了我们支持的[所有数据集](#所有已支持的数据集)。你可以点击链接跳转至对应的数据集详情页面。
+
+## 所有已支持的数据集
+
+* 数据集数量:{num_datasets}
+{dataset_msg}
+
+""" # noqa: E501
+
+
+def generate_datasets_pages():
+ dataset_list = Path('../../tools/data').glob('*/README.md')
+ num_datasets = 0
+ dataset_msgs = []
+
+ for file in dataset_list:
+ num_datasets += 1
+
+ copy = DATASETS_ROOT / file.parent.with_suffix('.md').name
+
+ title_template = r'^# Preparing (.*)'
+ # use chinese doc if exist
+ chinese_readme = Path(
+ str(file).replace('README.md', 'README_zh-CN.md'))
+ if chinese_readme.exists():
+ file = chinese_readme
+ title_template = r'^# 准备(.*)'
+ with open(file, 'r') as f:
+ content = f.read()
+
+ title = re.match(title_template, content).group(1)
+ title = title.lstrip(' ')
+ content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
+ file)
+ content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content,
+ file)
+ dataset_msgs.append(f'\t - [{title}]({copy})')
+
+ with open(copy, 'w') as f:
+ f.write(content)
+
+ dataset_msg = '\n'.join(dataset_msgs)
+
+ modelzoo = DATASETZOO_TEMPLATE.format(
+ num_datasets=num_datasets,
+ dataset_msg=dataset_msg,
+ )
+
+ with open('datasetzoo_statistics.md', 'w') as f:
+ f.write(modelzoo)
+
+
+DATASETS_ROOT.mkdir(exist_ok=True)
+generate_datasets_pages()
diff --git a/docs/zh_cn/docutils.conf b/docs/zh_cn/docutils.conf
new file mode 100644
index 0000000000..0c00c84688
--- /dev/null
+++ b/docs/zh_cn/docutils.conf
@@ -0,0 +1,2 @@
+[html writers]
+table_style: colwidths-auto
diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md
deleted file mode 100644
index d15e9d7b75..0000000000
--- a/docs/zh_cn/get_started.md
+++ /dev/null
@@ -1,184 +0,0 @@
-# 前置条件
-
-在本节中,我们将演示如何准备 PyTorch 相关的依赖环境。
-
-MMAction2 适用于 Linux、Windows 和 MacOS。它需要 Python 3.7+,CUDA 9.2+ 和 PyTorch 1.6+。
-
-```
-如果你对配置 PyTorch 环境已经很熟悉,并且已经完成了配置,可以直接进入[下一节](#安装)。
-否则的话,请依照以下步骤完成配置。
-```
-
-**第一步** 从[官网](https://docs.conda.io/en/latest/miniconda.html)下载并安装 Miniconda。
-
-**第二步** 创建一个 conda 虚拟环境并激活它。
-
-```shell
-conda create --name openmmlab python=3.8 -y
-conda activate openmmlab
-```
-
-**第三步** 根据[官方指南](https://pytorch.org/get-started/locally/)安装 PyTorch。例如:
-
-在GPU平台:
-
-```shell
-conda install pytorch torchvision -c pytorch
-```
-
-```
-以上命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请检查它们是否和你的环境匹配。
-```
-
-在CPU平台:
-
-```shell
-conda install pytorch torchvision cpuonly -c pytorch
-```
-
-# 安装
-
-我们推荐用户按照我们的最佳实践安装 MMAction2。但除此之外,如果你想根据你的习惯完成安装,流程见[自定义安装](#自定义安装)章节获取更多信息。
-
-## 最佳实践
-
-**第一步** 使用 MIM 安装 MMEngine 和 MMCV。
-
-```shell
-pip install -U openmim
-mim install mmengine 'mmcv>=2.0.0'
-```
-
-**第二步** 安装 MMAction2。
-
-根据你的需要,我们支持两种安装模式:
-
-- [从源码安装(推荐)](#从源码安装):希望开发自己的动作识别任务或者在 MMAction2 上开发新功能,例如,添加新的数据集或者新的模型。因此,你可以使用我们提供的所有工具。
-- [作为 Python 包安装](#作为-Python-包安装):只想希望调用 MMAction2 的 API 接口,或者在你的项目中导入 MMAction2 中的模块。
-
-### 从源码安装
-
-这种情况下,从源码按如下方式安装 MMAction2:
-
-```shell
-git clone https://github.com/open-mmlab/mmaction2.git
-cd mmaction2
-git checkout 1.x
-pip install -v -e .
-# "-v" 表示输出更多安装相关的信息
-# "-e" 表示以可编辑形式安装,这样可以在不重新安装的情况下,让本地修改直接生效
-```
-
-另外,如果你想为 MMAction2 贡献代码,或者体验试验中的功能,请签出到 `dev-1.x` 分支。
-
-```shell
-git checkout dev-1.x
-```
-
-### 作为 Python 包安装
-
-直接使用 pip 安装即可。
-
-```shell
-pip install "mmaction2>=1.0.0"
-```
-
-## 验证安装
-
-为了验证 MMAction2 的安装是否正确,我们提供了一些示例代码来执行模型推理。
-
-**第一步** 我们需要下载配置文件和模型权重文件。
-
-```shell
-mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .
-```
-
-**第二步** 验证示例的推理流程。
-
-如果你从源码安装 MMAction2,那么直接运行以下命令进行验证:
-
-```shell
-# demo.mp4 和 label_map_k400.txt 都来自于 Kinetics-400
-python demo/demo.py tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \
- tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \
- demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
-```
-
-终端上将输出获得最高分数的标签以及相应的分数。
-
-如果你是作为 Python 包安装,那么可以打开你的 Python 解释器,并粘贴如下代码:
-
-```python
-from mmaction.apis import init_recognizer, inference_recognizer
-
-config_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py'
-checkpoint_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth'
-video_file = 'demo/demo.mp4'
-label_file = 'tools/data/kinetics/label_map_k400.txt'
-model = init_recognizer(config_file, checkpoint_file, device='cpu') # or device='cuda:0'
-result = inference_recognizer(model, video_file)
-pred_scores = result.pred_scores.item.tolist()
-score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))
-score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)
-top5_label = score_sorted[:5]
-
-labels = open(label_file).readlines()
-labels = [x.strip() for x in labels]
-results = [(labels[k[0]], k[1]) for k in top5_label]
-
-print('The top-5 labels with corresponding scores are:')
-for result in results:
- print(f'{result[0]}: ', result[1])
-```
-
-## 自定义安装
-
-### CUDA 版本
-
-安装 PyTorch 时,你可能需要安装特定的 CUDA 的版本。如果你不清楚应该选择哪个版本,请遵循我们的建议:
-
-- 对于 Ampere 架构的 NVIDIA GPU,例如 GeForce 30 series 以及 NVIDIA A100,CUDA 11 是必需的。
-- 对于更早的 NVIDIA GPU,CUDA 11 是向前兼容的,但 CUDA 10.2 能够提供更好的兼容性,也更加轻量。
-
-请确保你的 GPU 驱动满足要求的最低版本,详见[此表格](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions)。
-
-```
-如果按照我们的最佳实践进行安装,CUDA 运行时库就足够了,因为我们提供相关 CUDA 代码的预编译,你不需要进行本地编译。
-但如果你希望从源码进行 MMCV 的编译,或是进行其他 CUDA 算子的开发,那么就必须安装完整的 CUDA 工具链,参见
-[NVIDIA 官网](https://developer.nvidia.com/cuda-downloads),另外还需要确保该 CUDA 工具链的版本与 PyTorch 安装时
-的配置相匹配(如用 `conda install` 安装 PyTorch 时指定的 cudatoolkit 版本)。
-```
-
-### 不使用 MIM 安装 MMCV
-
-MMCV 包含 C++ 和 CUDA 扩展,因此其对 PyTorch 的依赖比较复杂。 MIM 会自动解析此类依赖关系,选择合适的 MMCV 预编译包,使安装更简单,但它并不是必需的。
-
-要使用 pip 而不是 MIM 安装 MMCV,请遵循 MMCV [安装指南](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html)。它需要你用指定 url 的形式手动指定对应的 PyTorch 和 CUDA 版本。
-
-例如,以下命令安装为 PyTorch 1.10.x 和 CUDA 11.3 构建的 mmcv。
-
-```shell
-pip install 'mmcv>=2.0.0' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html
-```
-
-### 在 CPU 环境中安装
-
-MMAction2 可以仅在 CPU 环境中安装。在 CPU 模式下,你可以完成训练、测试和模型推理等所有操作。
-
-在 CPU 模式下,MMCV 的部分功能将不可用,通常是一些 GPU 编译的算子。不过不用担心, MMAction2 中几乎所有的模型都不会依赖这些算子。
-
-### 通过Docker使用MMAction2
-
-我们提供一个[Dockerfile](https://github.com/open-mmlab/mmaction2/blob/main/docker/Dockerfile)用来构建镜像,确保你的 [Docker版本](https://docs.docker.com/engine/install/)>=19.03.
-
-```shell
-# 例如构建PyTorch 1.6.0, CUDA 10.1, CUDNN 7的镜像
-# 如果你喜欢其他版本,只要修改Dockerfile
-docker build -f ./docker/Dockerfile --rm -t mmaction2 .
-```
-
-用以下命令运行 Docker 镜像:
-
-```shell
-docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2
-```
diff --git a/docs/zh_cn/get_started/contribution_guide.md b/docs/zh_cn/get_started/contribution_guide.md
new file mode 100644
index 0000000000..5eba1b58a0
--- /dev/null
+++ b/docs/zh_cn/get_started/contribution_guide.md
@@ -0,0 +1,62 @@
+# 参与贡献 MMACTION2
+
+欢迎各种形式的贡献,包括但不限于以下内容。
+
+- 修改拼写错误或代码错误
+- 新功能和组件
+- 添加文档或将文档翻译成其他语言
+- 添加关于视频理解算法的新项目(推荐),具体细节请参考[这里](../projectzoo.md)
+
+## 工作流程
+
+1. Fork 并拉取最新的 mmaction2
+2. 创建一个有意义的新分支(不要使用主分支进行 PR)
+3. 提交你的更改
+4. 创建一个 PR
+
+```{note}
+- 如果你计划添加一些涉及大规模更改的新功能,请首先打开一个 issue 进行讨论。
+- 如果你是论文的作者,并希望将你的方法包含在 mmaction2 中,请与我们联系。我们将非常感谢您的贡献。
+```
+
+## 代码风格
+
+### Python
+
+我们采用 [PEP8](https://www.python.org/dev/peps/pep-0008/) 作为首选代码风格。
+
+我们使用以下工具进行代码检查和格式化:
+
+- [flake8](http://flake8.pycqa.org/en/latest/):检查器
+- [yapf](https://github.com/google/yapf):格式化器
+- [isort](https://github.com/timothycrosley/isort):排序导入
+- [codespell](https://github.com/codespell-project/codespell):一个用于修复文本文件中常见拼写错误的 Python 工具。
+- [mdformat](https://github.com/executablebooks/mdformat):Mdformat 是一个自由裁量的 Markdown 格式化工具,可用于强制执行一致的 Markdown 文件样式。
+- [docformatter](https://github.com/myint/docformatter):一个格式化工具,用于格式化文档字符串。
+
+yapf 和 isort 的样式配置可以在 [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/main/setup.cfg) 中找到。
+
+我们使用 [pre-commit hook](https://pre-commit.com/) 来保证每次提交时自动进行代码检查和格式化,启用的功能包括 `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, 修复 `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, 对 `requirements.txt` 的排序等。
+预提交钩子的配置存储在 [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/main/.pre-commit-config.yaml) 中。
+
+在克隆仓库后,你需要安装并初始化预提交钩子。
+
+```shell
+pip install -U pre-commit
+```
+
+从仓库文件夹中
+
+```shell
+pre-commit install
+```
+
+在此之后,每次提交,代码规范检查和格式化工具都将被强制执行。
+
+```{note}
+在创建 PR 之前,请确保你的代码通过了 lint 检查并由 yapf 进行了格式化。
+```
+
+### C++ 和 CUDA
+
+我们遵循 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)。
diff --git a/docs/zh_cn/get_started/faq.md b/docs/zh_cn/get_started/faq.md
new file mode 100644
index 0000000000..6c785f1124
--- /dev/null
+++ b/docs/zh_cn/get_started/faq.md
@@ -0,0 +1,125 @@
+# 常见问题解答
+
+## 概述
+
+我们在这里列出了许多用户常遇到的问题以及相应的解决方案。
+
+- [常见问题解答](#常见问题解答)
+ - [概述](#概述)
+ - [安装](#安装)
+ - [数据](#数据)
+ - [训练](#训练)
+ - [测试](#测试)
+
+如果您发现任何频繁出现的问题并且有解决方法,欢迎在列表中补充。如果这里的内容没有涵盖您的问题,请使用[提供的模板](https://github.com/open-mmlab/mmaction2/tree/main/.github/ISSUE_TEMPLATE/error-report.md)创建一个问题,并确保在模板中填写所有必要的信息。
+
+## 安装
+
+- **"No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"**
+
+ 1. 使用 `pip uninstall mmcv` 命令卸载环境中的现有 mmcv。
+ 2. 参照[安装说明](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html#install-mmcv)安装 mmcv。
+
+- **"OSError: MoviePy Error: creation of None failed because of the following error"**
+
+ 使用 `pip install moviepy` 安装。更多信息可以参考[官方安装文档](https://zulko.github.io/moviepy/install.html), 请注意(根据这个 [issue](https://github.com/Zulko/moviepy/issues/693)):
+
+ 1. 对于 Windows 用户,[ImageMagick](https://www.imagemagick.org/script/index.php) 不会自动被 MoviePy 检测到,需要修改 `moviepy/config_defaults.py` 文件,提供 ImageMagick 二进制文件 `magick` 的路径,例如 `IMAGEMAGICK_BINARY = "C:\\Program Files\\ImageMagick_VERSION\\magick.exe"`
+  2. 对于 Linux 用户,如果 MoviePy 没有检测到 ImageMagick,需要修改 `/etc/ImageMagick-6/policy.xml` 文件,将 `<policy domain="path" rights="none" pattern="@*" />` 注释为 `<!-- <policy domain="path" rights="none" pattern="@*" /> -->`。
+
+- **"即使我已经安装了 XXCODEBASE,为什么还会收到 'Please install XXCODEBASE to use XXX' 的错误消息?"**
+
+ 您收到该错误消息是因为我们的项目无法从 XXCODEBASE 中导入一个函数或类。您可以尝试运行相应的代码行来查看发生了什么。一个可能的原因是,在 OpenMMLAB 的某些代码库中,您需要在安装它们之前先安装 mmcv 和 mmengine。您可以按照[教程](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html#installation)来安装它们。
+
+## 数据
+
+- **FileNotFound 错误,例如 `No such file or directory: xxx/xxx/img_00300.jpg`**
+
+ 在我们的仓库中,我们将 `start_index=1` 设置为 rawframe 数据集的默认值,将 `start_index=0` 设置为视频数据集的默认值。如果用户遇到数据的第一帧或最后一帧的 FileNotFound 错误,需要检查以 0 或 1 作为偏移量开始的文件,例如 `xxx_00000.jpg` 或 `xxx_00001.jpg`,然后在配置文件中更改数据处理流水线的 `start_index` 值。
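+
+  例如,若帧文件从 `xxx_00000.jpg` 开始编号,可以参考下面的示意配置(在 1.x 中 `start_index` 一般配置在数据集字段上,`filename_tmpl` 等取值仅为假设):
+
+  ```python
+  train_dataloader = dict(
+      ...
+      dataset=dict(
+          type='RawframeDataset',
+          filename_tmpl='img_{:05}.jpg',
+          start_index=0,  # 帧数据集默认为 1,此处改为从 0 开始
+          ...))
+  ```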
+
+- **我们应该如何预处理数据集中的视频?将它们调整为固定大小(所有视频的高宽比相同),例如 `340x256`(1),还是调整它们使得所有视频的短边具有相同的长度(256px 或 320px)(2)?**
+
+ 我们尝试过这两种预处理方法,并发现(2)通常是更好的解决方案,因此我们使用(2)作为默认的预处理设置,短边长度为 256px。我们对这些预处理方法进行了基准测试,您可以在[TSN 数据基准测试](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn)和[SlowOnly 数据基准测试](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/slowonly)中找到结果。
+
+- **数据处理流水线中的项不匹配导致出现类似 `KeyError: 'total_frames'` 的错误**
+
+ 我们有用于处理视频和帧的两个处理流水线。
+
+ **对于视频**,我们应该在处理流水线中动态解码视频,所以在这种情况下应该使用 `DecordInit & DecordDecode`、`OpenCVInit & OpenCVDecode` 或 `PyAVInit & PyAVDecode` 这样的配对,例如[这个示例](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py#L14-L16)。
+
+ **对于帧**,图像已经在离线状态下解码,所以在这种情况下应该使用 `RawFrameDecode` 这样的处理流水线项,例如[这个示例](https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py#L17)。
+
+ `KeyError: 'total_frames'` 是由于错误地将 `RawFrameDecode` 步骤用于视频,因为当输入是视频时,无法预先获取 `total_frames`。
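+
+  下面的示意片段对比了两种流水线的开头部分(其余转换省略):
+
+  ```python
+  # 视频输入:先初始化解码器,再按帧索引在线解码
+  video_pipeline = [
+      dict(type='DecordInit'),
+      dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+      dict(type='DecordDecode'),
+      ...
+  ]
+
+  # 帧输入:帧已离线解码,直接读取图片
+  rawframe_pipeline = [
+      dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+      dict(type='RawFrameDecode'),
+      ...
+  ]
+  ```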
+
+## 训练
+
+- **如何只使用训练好的识别模型进行主干网络的预训练?**
+
+ 为了使用预训练模型进行整个网络的训练,新的配置文件在 `load_from` 中添加了预训练模型的链接。
+
+ 要使用主干进行预训练,可以将配置文件中主干部分的 `pretrained` 值更改为权重路径/URL。在训练时,未预料到的键将被忽略。
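+
+  下面是两种写法的示意配置(权重路径均为假设值):
+
+  ```python
+  # 1. 用整个识别模型的权重初始化网络(微调常用)
+  load_from = 'checkpoints/pretrained_recognizer.pth'  # 假设的路径或 URL
+
+  # 2. 仅为主干网络加载预训练权重
+  model = dict(
+      backbone=dict(pretrained='path/to/backbone_weight.pth', ...),  # 假设的路径或 URL
+      ...)
+  ```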
+
+- **在微调模型时如何固定主干的某些阶段?**
+
+ 您可以参考 [`def _freeze_stages()`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L791) 和 [`frozen_stages`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/models/backbones/resnet3d.py#L369-L370)。
+ 提醒在配置文件中设置 `find_unused_parameters = True`,以进行分布式训练或测试。
+
+ 实际上,除了少数模型,如 C3D 等,用户可以设置 `frozen_stages` 来冻结主干的阶段,因为几乎所有继承自 `ResNet` 和 `ResNet3D` 的主干都支持内部函数 `_freeze_stages()`。
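+
+  一个示意配置如下(冻结的阶段数仅作示例):
+
+  ```python
+  model = dict(
+      backbone=dict(frozen_stages=2, ...),  # 冻结主干的前两个阶段
+      ...)
+
+  # 分布式训练或测试时建议同时设置
+  find_unused_parameters = True
+  ```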
+
+- **如何在配置文件中设置 memcached ?**
+
+ 在 MMAction2 中,您可以将 memcached 的参数传递给用于视频数据集的 `class DecordInit` 或用于原始帧数据集的 `RawFrameDecode`。有关更多细节,请参阅 MMEngine 中的 [`class FileClient`](https://github.com/open-mmlab/mmaction2/blob/main/mmaction/data/pipelines/file_client.py)。以下是一个示例,演示如何在原始帧数据集中使用 memcached:
+
+ ```python
+ mc_cfg = dict(server_list_cfg='server_list_cfg', client_cfg='client_cfg', sys_path='sys_path')
+
+ train_pipeline = [
+ ...
+ dict(type='RawFrameDecode', io_backend='memcached', **mc_cfg),
+ ...
+ ]
+ ```
+
+- **如何在配置文件中设置 `load_from` 的值以微调模型?**
+
+ 在 MMAction2 中,我们将 `load_from=None` 设置为 `configs/_base_/default_runtime.py` 中的默认值,并且由于[继承设计](https://github.com/open-mmlab/mmaction2/tree/main/docs/en/user_guides/config.md),用户可以直接通过在其配置文件中设置 `load_from` 来更改它。
+
+- **如何在训练时使用 `RawFrameDataset`?**
+
+ 在 MMAction2 1.x 版本中,大多数配置文件默认使用 `VideoDataset` 作为数据集类型,这对于文件存储更加友好。如果您想使用 `RawFrameDataset`,需要进行两个修改步骤:
+
+ - `dataset` 相关:
+ 将 `train_dataloader`/`val_dataloader`/`test_dataloader` 中的 `dataset` 从
+
+ ```
+ dataset=dict(
+ type=VideoDataset,
+ data_prefix=dict(video=xxx),
+ ...)
+ ```
+
+ 修改为
+
+ ```
+ dataset=dict(
+ type=RawFrameDataset,
+ data_prefix=dict(img=xxx),
+ filename_tmpl='{:05}.jpg',
+ ...)
+ ```
+
+ 数据集的其他字段不需要修改。请确保 `filename_tmpl` 与帧数据匹配,并参考[配置文件文档](../user_guides/config.md)了解更多关于配置文件的详细信息。
+
+  - `transform` 相关:在 `train_pipeline`/`val_pipeline`/`test_pipeline` 中删除 `dict(type='DecordInit', **file_client_args)`,将 `dict(type='DecordDecode')` 修改为 `dict(type='RawFrameDecode', **file_client_args)`,并确保在配置文件中定义了 `file_client_args = dict(io_backend='disk')`(示意流水线见下文)。
+
+ 有关自定义数据集的更多修改,请参考[准备数据集](../user_guides/prepare_dataset.md)和[自定义数据集](../advanced_guides/customize_dataset.md)。
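+
+  针对上述 `transform` 相关的修改,下面给出一个示意流水线(省略了与本问题无关的其他转换):
+
+  ```python
+  file_client_args = dict(io_backend='disk')
+  train_pipeline = [
+      dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+      dict(type='RawFrameDecode', **file_client_args),
+      ...
+  ]
+  ```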
+
+## 测试
+
+- **如何使预测得分在 softmax 内归一化到 \[0, 1\] ?**
+
+ 在配置文件中将 `model.cls_head.average_clips` 设置为 `'prob'`。
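+
+  示意配置如下(其余字段省略):
+
+  ```python
+  model = dict(
+      cls_head=dict(average_clips='prob', ...),
+      ...)
+  ```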
+
+- **如果模型过大,GPU 内存无法容纳甚至只有一个测试样本怎么办?**
+
+ 默认情况下,3D 模型使用 10 个 clips x 3 个 crops 进行测试,总共有 30 个视图。对于非常大的模型,即使只有一个测试样本,GPU 内存也无法容纳(因为有 30 个视图)。为了解决这个问题,您可以在配置文件的 `model['test_cfg']` 中设置 `max_testing_views=n`。这样,在前向传播过程中,会使用 n 个视图作为一个批次,以节省 GPU 内存的使用。
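+
+  示意配置如下(视图数 `4` 仅作示例):
+
+  ```python
+  model = dict(
+      ...
+      test_cfg=dict(max_testing_views=4))
+  ```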
diff --git a/docs/zh_cn/get_started/guide_to_framework.md b/docs/zh_cn/get_started/guide_to_framework.md
new file mode 100644
index 0000000000..b92c376b5d
--- /dev/null
+++ b/docs/zh_cn/get_started/guide_to_framework.md
@@ -0,0 +1,761 @@
+# 20分钟了解 MMAction2 框架设计
+
+在本教程中,我们将通过一个视频动作识别的手把手教程来演示 `MMACTION2 1.0` 的整体架构。
+
+本教程的目录如下:
+
+- [20分钟了解 MMAction2 框架设计](#20分钟了解-mmaction2-框架设计)
+ - [步骤0:准备数据](#步骤0准备数据)
+ - [步骤1:构建一个数据流水线](#步骤1构建一个数据流水线)
+ - [步骤2:构建一个数据集和数据加载器](#步骤2构建一个数据集和数据加载器)
+ - [步骤3:构建一个识别器](#步骤3构建一个识别器)
+ - [步骤4:构建一个评估指标](#步骤4构建一个评估指标)
+ - [步骤5:使用本地 PyTorch 训练和测试](#步骤5使用本地-pytorch-训练和测试)
+ - [步骤6:使用 MMEngine 训练和测试(推荐)](#步骤6使用-mmengine-训练和测试推荐)
+
+首先,我们需要初始化注册表的 `scope` ,以确保每个模块都在 `mmaction` 范围下注册。有关注册表的更多详细信息,请参考[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html) 。
+
+```python
+from mmaction.utils import register_all_modules
+
+register_all_modules(init_default_scope=True)
+```
+
+## 步骤0:准备数据
+
+请下载我们准备的[精简版 kinetics400](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) 数据集,并将其提取到 `$MMACTION2/data` 目录。
+
+解压后的目录结构应如下所示:
+
+```
+mmaction2
+├── data
+│ ├── kinetics400_tiny
+│ │ ├── kinetics_tiny_train_video.txt
+│ │ ├── kinetics_tiny_val_video.txt
+│ │ ├── train
+│ │ │ ├── 27_CSXByd3s.mp4
+│ │ │ ├── 34XczvTaRiI.mp4
+│ │ │ ├── A-wiliK50Zw.mp4
+│ │ │ ├── ...
+│ │ └── val
+│ │ ├── 0pVGiAU6XEA.mp4
+│ │ ├── AQrbRSnRt8M.mp4
+│ │ ├── ...
+```
+
+以下是标注文件 `kinetics_tiny_train_video.txt` 中的一些示例:
+
+```
+D32_1gwq35E.mp4 0
+iRuyZSKhHRg.mp4 1
+oXy-e_P_cAI.mp4 0
+34XczvTaRiI.mp4 1
+h2YqqUhnR34.mp4 0
+```
+
+文件中的每一行表示每一个视频的标注,其中第一项表示视频文件名(如 `D32_1gwq35E.mp4` ),第二项表示相应的标签(如 `D32_1gwq35E.mp4` 的标签是 `0` )。在这个数据集中,只有 `两个` 类别。
+
+## 步骤1:构建一个数据流水线
+
+为了实现 `解码`、`采样`、`调整大小`、`裁剪`、`格式化` 和 `打包` 视频数据和相应的标签,我们需要设计一个数据流水线来处理这些过程。具体来说,我们设计了7个 `Transform` 类来构建这个视频处理流水线。注意,OpenMMLab 中的所有`Transform` 类都必须继承自 `mmcv` 中的 `BaseTransform` 类,实现抽象方法 `transform`,并注册到 `TRANSFORMS` 注册表。有关数据转换的更多详细信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html) 。
+
+```python
+import mmcv
+import decord
+import numpy as np
+from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor
+from mmaction.structures import ActionDataSample
+
+
+@TRANSFORMS.register_module()
+class VideoInit(BaseTransform):
+ def transform(self, results):
+ container = decord.VideoReader(results['filename'])
+ results['total_frames'] = len(container)
+ results['video_reader'] = container
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoSample(BaseTransform):
+ def __init__(self, clip_len, num_clips, test_mode=False):
+ self.clip_len = clip_len
+ self.num_clips = num_clips
+ self.test_mode = test_mode
+
+ def transform(self, results):
+ total_frames = results['total_frames']
+ interval = total_frames // self.clip_len
+
+ if self.test_mode:
+ # 使测试期间的采样具有确定性
+ np.random.seed(42)
+
+ inds_of_all_clips = []
+ for i in range(self.num_clips):
+ bids = np.arange(self.clip_len) * interval
+ offset = np.random.randint(interval, size=bids.shape)
+ inds = bids + offset
+ inds_of_all_clips.append(inds)
+
+ results['frame_inds'] = np.concatenate(inds_of_all_clips)
+ results['clip_len'] = self.clip_len
+ results['num_clips'] = self.num_clips
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoDecode(BaseTransform):
+ def transform(self, results):
+ frame_inds = results['frame_inds']
+ container = results['video_reader']
+
+ imgs = container.get_batch(frame_inds).asnumpy()
+ imgs = list(imgs)
+
+ results['video_reader'] = None
+ del container
+
+ results['imgs'] = imgs
+ results['img_shape'] = imgs[0].shape[:2]
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoResize(BaseTransform):
+ def __init__(self, r_size):
+ self.r_size = (np.inf, r_size)
+
+ def transform(self, results):
+ img_h, img_w = results['img_shape']
+ new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size)
+
+ imgs = [mmcv.imresize(img, (new_w, new_h))
+ for img in results['imgs']]
+ results['imgs'] = imgs
+ results['img_shape'] = imgs[0].shape[:2]
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoCrop(BaseTransform):
+ def __init__(self, c_size):
+ self.c_size = c_size
+
+ def transform(self, results):
+ img_h, img_w = results['img_shape']
+ center_x, center_y = img_w // 2, img_h // 2
+ x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2
+ y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2
+ imgs = [img[y1:y2, x1:x2] for img in results['imgs']]
+ results['imgs'] = imgs
+ results['img_shape'] = imgs[0].shape[:2]
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoFormat(BaseTransform):
+ def transform(self, results):
+ num_clips = results['num_clips']
+ clip_len = results['clip_len']
+ imgs = results['imgs']
+
+ # [num_clips*clip_len, H, W, C]
+ imgs = np.array(imgs)
+ # [num_clips, clip_len, H, W, C]
+ imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:])
+ # [num_clips, C, clip_len, H, W]
+ imgs = imgs.transpose(0, 4, 1, 2, 3)
+
+ results['imgs'] = imgs
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoPack(BaseTransform):
+ def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')):
+ self.meta_keys = meta_keys
+
+ def transform(self, results):
+ packed_results = dict()
+ inputs = to_tensor(results['imgs'])
+ data_sample = ActionDataSample().set_gt_labels(results['label'])
+ metainfo = {k: results[k] for k in self.meta_keys if k in results}
+ data_sample.set_metainfo(metainfo)
+ packed_results['inputs'] = inputs
+ packed_results['data_samples'] = data_sample
+ return packed_results
+```
+
+下面,我们提供了一个代码片段(使用标注文件中的 `D32_1gwq35E.mp4 0` )来演示如何使用数据流水线。
+
+```python
+import os.path as osp
+from mmengine.dataset import Compose
+
+pipeline_cfg = [
+ dict(type='VideoInit'),
+ dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False),
+ dict(type='VideoDecode'),
+ dict(type='VideoResize', r_size=256),
+ dict(type='VideoCrop', c_size=224),
+ dict(type='VideoFormat'),
+ dict(type='VideoPack')
+]
+
+pipeline = Compose(pipeline_cfg)
+data_prefix = 'data/kinetics400_tiny/train'
+results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0)
+packed_results = pipeline(results)
+
+inputs = packed_results['inputs']
+data_sample = packed_results['data_samples']
+
+print('shape of the inputs: ', inputs.shape)
+
+# 获取输入的信息
+print('image_shape: ', data_sample.img_shape)
+print('num_clips: ', data_sample.num_clips)
+print('clip_len: ', data_sample.clip_len)
+
+# 获取输入的标签
+print('label: ', data_sample.gt_labels.item)
+```
+
+```
+shape of the inputs: torch.Size([1, 3, 16, 224, 224])
+image_shape: (224, 224)
+num_clips: 1
+clip_len: 16
+label: tensor([0])
+```
+
+## 步骤2:构建一个数据集和数据加载器
+
+OpenMMLab中的所有 `Dataset` 类都必须继承自 `mmengine` 中的 `BaseDataset` 类。我们可以通过覆盖 `load_data_list` 方法来定制注释加载过程。此外,我们可以通过覆盖 `get_data_info` 方法,向 `results` 字典添加更多字段,它将作为输入传给 `pipeline` 。有关 `BaseDataset` 类的更多详细信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) 。
+
+```python
+import os.path as osp
+from mmengine.fileio import list_from_file
+from mmengine.dataset import BaseDataset
+from mmaction.registry import DATASETS
+
+
+@DATASETS.register_module()
+class DatasetZelda(BaseDataset):
+ def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''),
+ test_mode=False, modality='RGB', **kwargs):
+ self.modality = modality
+ super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root,
+ data_prefix=data_prefix, test_mode=test_mode,
+ **kwargs)
+
+ def load_data_list(self):
+ data_list = []
+ fin = list_from_file(self.ann_file)
+ for line in fin:
+ line_split = line.strip().split()
+ filename, label = line_split
+ label = int(label)
+ filename = osp.join(self.data_prefix['video'], filename)
+ data_list.append(dict(filename=filename, label=label))
+ return data_list
+
+ def get_data_info(self, idx: int) -> dict:
+ data_info = super().get_data_info(idx)
+ data_info['modality'] = self.modality
+ return data_info
+```
+
+接下来,我们将演示如何使用 dataset 和 dataloader 来索引数据。我们将使用 `Runner.build_dataloader` 方法来构造 dataloader。有关 dataloader 的更多详细信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader) 。
+
+```python
+from mmaction.registry import DATASETS
+
+train_pipeline_cfg = [
+ dict(type='VideoInit'),
+ dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False),
+ dict(type='VideoDecode'),
+ dict(type='VideoResize', r_size=256),
+ dict(type='VideoCrop', c_size=224),
+ dict(type='VideoFormat'),
+ dict(type='VideoPack')
+]
+
+val_pipeline_cfg = [
+ dict(type='VideoInit'),
+ dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True),
+ dict(type='VideoDecode'),
+ dict(type='VideoResize', r_size=256),
+ dict(type='VideoCrop', c_size=224),
+ dict(type='VideoFormat'),
+ dict(type='VideoPack')
+]
+
+train_dataset_cfg = dict(
+ type='DatasetZelda',
+ ann_file='kinetics_tiny_train_video.txt',
+ pipeline=train_pipeline_cfg,
+ data_root='data/kinetics400_tiny/',
+ data_prefix=dict(video='train'))
+
+val_dataset_cfg = dict(
+ type='DatasetZelda',
+ ann_file='kinetics_tiny_val_video.txt',
+ pipeline=val_pipeline_cfg,
+ data_root='data/kinetics400_tiny/',
+ data_prefix=dict(video='val'))
+
+train_dataset = DATASETS.build(train_dataset_cfg)
+
+packed_results = train_dataset[0]
+
+inputs = packed_results['inputs']
+data_sample = packed_results['data_samples']
+
+print('shape of the inputs: ', inputs.shape)
+
+# 获取输入的信息
+print('image_shape: ', data_sample.img_shape)
+print('num_clips: ', data_sample.num_clips)
+print('clip_len: ', data_sample.clip_len)
+
+# 获取输入的标签
+print('label: ', data_sample.gt_labels.item)
+
+from mmengine.runner import Runner
+
+BATCH_SIZE = 2
+
+train_dataloader_cfg = dict(
+ batch_size=BATCH_SIZE,
+ num_workers=0,
+ persistent_workers=False,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=train_dataset_cfg)
+
+val_dataloader_cfg = dict(
+ batch_size=BATCH_SIZE,
+ num_workers=0,
+ persistent_workers=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=val_dataset_cfg)
+
+train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg)
+val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg)
+
+batched_packed_results = next(iter(train_data_loader))
+
+batched_inputs = batched_packed_results['inputs']
+batched_data_sample = batched_packed_results['data_samples']
+
+assert len(batched_inputs) == BATCH_SIZE
+assert len(batched_data_sample) == BATCH_SIZE
+```
+
+终端输出应该与[步骤1:构建一个数据流水线](#步骤1构建一个数据流水线)中的输出相同。
+
+## 步骤3:构建一个识别器
+
+接下来,我们将构建 `recognizer`,它主要由三部分组成:用于批处理和规范化数据的 `data preprocessor`,用于特征提取的 `backbone` 和用于分类的 `cls_head` 。
+
+`data_preprocessor` 的实现如下:
+
+```python
+import torch
+from mmengine.model import BaseDataPreprocessor, stack_batch
+from mmaction.registry import MODELS
+
+
+@MODELS.register_module()
+class DataPreprocessorZelda(BaseDataPreprocessor):
+ def __init__(self, mean, std):
+ super().__init__()
+
+ self.register_buffer(
+ 'mean',
+ torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1),
+ False)
+ self.register_buffer(
+ 'std',
+ torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1),
+ False)
+
+ def forward(self, data, training=False):
+ data = self.cast_data(data)
+ inputs = data['inputs']
+ batch_inputs = stack_batch(inputs) # 批处理
+ batch_inputs = (batch_inputs - self.mean) / self.std # 归一化
+ data['inputs'] = batch_inputs
+ return data
+```
+
+以下是 data_preprocessor 的用法:将从[步骤2:构建一个数据集和数据加载器](#步骤2构建一个数据集和数据加载器)中获得的 `batched_packed_results` 提供给 `data_preprocessor` 进行批处理和归一化。
+
+```python
+from mmaction.registry import MODELS
+
+data_preprocessor_cfg = dict(
+ type='DataPreprocessorZelda',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375])
+
+data_preprocessor = MODELS.build(data_preprocessor_cfg)
+
+preprocessed_inputs = data_preprocessor(batched_packed_results)
+print(preprocessed_inputs['inputs'].shape)
+```
+
+```
+torch.Size([2, 1, 3, 16, 224, 224])
+```
+
+`backbone`、`cls_head` 和 `recognizer` 的实现如下:
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModel, BaseModule, Sequential
+from mmengine.structures import LabelData
+from mmaction.registry import MODELS
+
+
+@MODELS.register_module()
+class BackBoneZelda(BaseModule):
+ def __init__(self, init_cfg=None):
+ if init_cfg is None:
+ init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"),
+ dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)]
+
+ super(BackBoneZelda, self).__init__(init_cfg=init_cfg)
+
+ self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7),
+ stride=(1, 2, 2), padding=(1, 3, 3)),
+ nn.BatchNorm3d(64), nn.ReLU())
+ self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2),
+ padding=(0, 1, 1))
+
+ self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1),
+ nn.BatchNorm3d(128), nn.ReLU())
+
+ def forward(self, imgs):
+ # imgs: [batch_size*num_views, 3, T, H, W]
+ # features: [batch_size*num_views, 128, T/2, H//8, W//8]
+ features = self.conv(self.maxpool(self.conv1(imgs)))
+ return features
+
+
+@MODELS.register_module()
+class ClsHeadZelda(BaseModule):
+ def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None):
+ if init_cfg is None:
+ init_cfg = dict(type='Normal', layer='Linear', std=0.01)
+
+ super(ClsHeadZelda, self).__init__(init_cfg=init_cfg)
+
+ self.num_classes = num_classes
+ self.in_channels = in_channels
+ self.average_clips = average_clips
+
+ if dropout != 0:
+ self.dropout = nn.Dropout(dropout)
+ else:
+ self.dropout = None
+
+ self.fc = nn.Linear(self.in_channels, self.num_classes)
+ self.pool = nn.AdaptiveAvgPool3d(1)
+ self.loss_fn = nn.CrossEntropyLoss()
+
+ def forward(self, x):
+ N, C, T, H, W = x.shape
+ x = self.pool(x)
+ x = x.view(N, C)
+ assert x.shape[1] == self.in_channels
+
+ if self.dropout is not None:
+ x = self.dropout(x)
+
+ cls_scores = self.fc(x)
+ return cls_scores
+
+ def loss(self, feats, data_samples):
+ cls_scores = self(feats)
+ labels = torch.stack([x.gt_labels.item for x in data_samples])
+ labels = labels.squeeze()
+
+ if labels.shape == torch.Size([]):
+ labels = labels.unsqueeze(0)
+
+ loss_cls = self.loss_fn(cls_scores, labels)
+ return dict(loss_cls=loss_cls)
+
+ def predict(self, feats, data_samples):
+ cls_scores = self(feats)
+ num_views = cls_scores.shape[0] // len(data_samples)
+ # assert num_views == data_samples[0].num_clips
+ cls_scores = self.average_clip(cls_scores, num_views)
+
+ for ds, sc in zip(data_samples, cls_scores):
+ pred = LabelData(item=sc)
+ ds.pred_scores = pred
+ return data_samples
+
+ def average_clip(self, cls_scores, num_views):
+ if self.average_clips not in ['score', 'prob', None]:
+ raise ValueError(f'{self.average_clips} is not supported. '
+ f'Currently supported ones are '
+ f'["score", "prob", None]')
+
+ total_views = cls_scores.shape[0]
+ cls_scores = cls_scores.view(total_views // num_views, num_views, -1)
+
+ if self.average_clips is None:
+ return cls_scores
+ elif self.average_clips == 'prob':
+ cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1)
+ elif self.average_clips == 'score':
+ cls_scores = cls_scores.mean(dim=1)
+
+ return cls_scores
+
+
+@MODELS.register_module()
+class RecognizerZelda(BaseModel):
+ def __init__(self, backbone, cls_head, data_preprocessor):
+ super().__init__(data_preprocessor=data_preprocessor)
+
+ self.backbone = MODELS.build(backbone)
+ self.cls_head = MODELS.build(cls_head)
+
+ def extract_feat(self, inputs):
+ inputs = inputs.view((-1, ) + inputs.shape[2:])
+ return self.backbone(inputs)
+
+ def loss(self, inputs, data_samples):
+ feats = self.extract_feat(inputs)
+ loss = self.cls_head.loss(feats, data_samples)
+ return loss
+
+ def predict(self, inputs, data_samples):
+ feats = self.extract_feat(inputs)
+ predictions = self.cls_head.predict(feats, data_samples)
+ return predictions
+
+ def forward(self, inputs, data_samples=None, mode='tensor'):
+ if mode == 'tensor':
+ return self.extract_feat(inputs)
+ elif mode == 'loss':
+ return self.loss(inputs, data_samples)
+ elif mode == 'predict':
+ return self.predict(inputs, data_samples)
+ else:
+ raise RuntimeError(f'Invalid mode: {mode}')
+```
+
+`init_cfg` 用于模型权重初始化。有关模型权重初始化的更多信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html) 。上述模块的用法如下:
+
+```python
+import torch
+import copy
+from mmaction.registry import MODELS
+
+model_cfg = dict(
+ type='RecognizerZelda',
+ backbone=dict(type='BackBoneZelda'),
+ cls_head=dict(
+ type='ClsHeadZelda',
+ num_classes=2,
+ in_channels=128,
+ average_clips='prob'),
+ data_preprocessor = dict(
+ type='DataPreprocessorZelda',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375]))
+
+model = MODELS.build(model_cfg)
+
+# 训练
+model.train()
+model.init_weights()
+data_batch_train = copy.deepcopy(batched_packed_results)
+data = model.data_preprocessor(data_batch_train, training=True)
+loss = model(**data, mode='loss')
+print('loss dict: ', loss)
+
+# 验证
+with torch.no_grad():
+ model.eval()
+ data_batch_test = copy.deepcopy(batched_packed_results)
+ data = model.data_preprocessor(data_batch_test, training=False)
+ predictions = model(**data, mode='predict')
+print('Label of Sample[0]', predictions[0].gt_labels.item)
+print('Scores of Sample[0]', predictions[0].pred_scores.item)
+```
+
+```shell
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.0.bias - torch.Size([64]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.1.weight - torch.Size([64]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.1.bias - torch.Size([64]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.0.bias - torch.Size([128]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.1.weight - torch.Size([128]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.1.bias - torch.Size([128]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+cls_head.fc.weight - torch.Size([2, 128]):
+NormalInit: mean=0, std=0.01, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+cls_head.fc.bias - torch.Size([2]):
+NormalInit: mean=0, std=0.01, bias=0
+
+loss dict: {'loss_cls': tensor(0.6853, grad_fn=)}
+Label of Sample[0] tensor([0])
+Scores of Sample[0] tensor([0.5240, 0.4760])
+```
+
+## 步骤4:构建一个评估指标
+
+请注意,`OpenMMLab` 中的所有 `Metric` 类都必须继承自 `mmengine` 中的 `BaseMetric` 类,并实现抽象方法 `process` 和`compute_metrics`。有关评估的更多信息,请参阅[ MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html) 。
+
+```python
+import copy
+from collections import OrderedDict
+from mmengine.evaluator import BaseMetric
+from mmaction.evaluation import top_k_accuracy
+from mmaction.registry import METRICS
+
+
+@METRICS.register_module()
+class AccuracyMetric(BaseMetric):
+ def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'):
+ super().__init__(collect_device=collect_device, prefix=prefix)
+ self.topk = topk
+
+ def process(self, data_batch, data_samples):
+ data_samples = copy.deepcopy(data_samples)
+ for data_sample in data_samples:
+ result = dict()
+ scores = data_sample['pred_scores']['item'].cpu().numpy()
+ label = data_sample['gt_labels']['item'].item()
+ result['scores'] = scores
+ result['label'] = label
+ self.results.append(result)
+
+ def compute_metrics(self, results: list) -> dict:
+ eval_results = OrderedDict()
+ labels = [res['label'] for res in results]
+ scores = [res['scores'] for res in results]
+ topk_acc = top_k_accuracy(scores, labels, self.topk)
+ for k, acc in zip(self.topk, topk_acc):
+ eval_results[f'topk{k}'] = acc
+ return eval_results
+```
+
+```python
+from mmaction.registry import METRICS
+
+metric_cfg = dict(type='AccuracyMetric', topk=(1, 5))
+
+metric = METRICS.build(metric_cfg)
+
+data_samples = [d.to_dict() for d in predictions]
+
+metric.process(batched_packed_results, data_samples)
+acc = metric.compute_metrics(metric.results)
+print(acc)
+```
+
+```shell
+OrderedDict([('topk1', 0.5), ('topk5', 1.0)])
+```
+
+## 步骤5:使用本地 PyTorch 训练和测试
+
+```python
+import torch.optim as optim
+from mmengine import track_iter_progress
+
+
+device = 'cuda' # or 'cpu'
+max_epochs = 10
+
+optimizer = optim.Adam(model.parameters(), lr=0.01)
+
+for epoch in range(max_epochs):
+ model.train()
+ losses = []
+ for data_batch in track_iter_progress(train_data_loader):
+ data = model.data_preprocessor(data_batch, training=True)
+ loss_dict = model(**data, mode='loss')
+ loss = loss_dict['loss_cls']
+
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ losses.append(loss.item())
+
+ print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader))
+
+ with torch.no_grad():
+ model.eval()
+ for data_batch in track_iter_progress(val_data_loader):
+ data = model.data_preprocessor(data_batch, training=False)
+ predictions = model(**data, mode='predict')
+ data_samples = [d.to_dict() for d in predictions]
+ metric.process(data_batch, data_samples)
+
+ acc = metric.compute_metrics(metric.results)
+ for name, topk in acc.items():
+ print(f'{name}: ', topk)
+```
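+
+训练结束后,如果希望保留本地训练得到的权重,可以直接使用 PyTorch 自带的接口保存(以下仅为示意,保存路径为假设):
+
+```python
+import torch
+
+# 将模型参数保存到一个假设的路径下,便于后续加载复用
+torch.save(model.state_dict(), './work_dirs/guide/recognizer_zelda.pth')
+```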
+
+## 步骤6:使用 MMEngine 训练和测试(推荐)
+
+关于训练和测试的更多细节,你可以参考 [MMAction2 教程](https://mmaction2.readthedocs.io/en/latest/user_guides/train_test.html)。有关 `Runner` 的更多信息,请参阅 [MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html)。
+
+```python
+from mmengine.runner import Runner
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1)
+val_cfg = dict(type='ValLoop')
+
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01))
+
+runner = Runner(model=model_cfg, work_dir='./work_dirs/guide',
+ train_dataloader=train_dataloader_cfg,
+ train_cfg=train_cfg,
+ val_dataloader=val_dataloader_cfg,
+ val_cfg=val_cfg,
+ optim_wrapper=optim_wrapper,
+ val_evaluator=[metric_cfg],
+ default_scope='mmaction')
+runner.train()
+```
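+
+如果还想用同一个 `Runner` 完成测试,可以在构造时额外传入 test 相关字段,并在训练结束后调用 `runner.test()`。下面是一个最小示意,假设直接复用验证集的 dataloader 配置 `val_dataloader_cfg` 与评估指标 `metric_cfg`:
+
+```python
+runner = Runner(model=model_cfg, work_dir='./work_dirs/guide',
+                train_dataloader=train_dataloader_cfg,
+                train_cfg=train_cfg,
+                val_dataloader=val_dataloader_cfg,
+                val_cfg=val_cfg,
+                # 示意:此处直接复用验证集的配置作为测试集配置
+                test_dataloader=val_dataloader_cfg,
+                test_cfg=dict(type='TestLoop'),
+                optim_wrapper=optim_wrapper,
+                val_evaluator=[metric_cfg],
+                test_evaluator=[metric_cfg],
+                default_scope='mmaction')
+runner.train()
+runner.test()
+```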
diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md
new file mode 100644
index 0000000000..0e144ce6eb
--- /dev/null
+++ b/docs/zh_cn/get_started/installation.md
@@ -0,0 +1,198 @@
+# 安装
+
+## 前置条件
+
+在本节中,我们将演示如何准备 PyTorch 相关的依赖环境。
+
+MMAction2 适用于 Linux、Windows 和 MacOS。它需要 Python 3.7+,CUDA 10.2+ 和 PyTorch 1.8+。
+
+```{note}
+如果您熟悉 PyTorch 并且已经安装了它,可以跳过这部分内容,直接转到[下一节](#installation)。否则,您可以按照以下步骤进行准备工作。
+```
+
+**第一步。** 从[官方网站](https://docs.conda.io/en/latest/miniconda.html)下载并安装 Miniconda。
+
+**第二步。** 创建一个 conda 环境并激活它。
+
+```shell
+conda create --name openmmlab python=3.8 -y
+conda activate openmmlab
+```
+
+**第三步。** 安装 PyTorch,按照[官方说明](https://pytorch.org/get-started/locally/)进行操作,例如:
+
+在 GPU 平台上:
+
+```shell
+conda install pytorch torchvision -c pytorch
+```
+
+```{warning}
+此命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请确保它们与您的环境匹配。
+```
+
+在 CPU 平台上:
+
+```shell
+conda install pytorch torchvision cpuonly -c pytorch
+```
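+
+安装完成后,可以用下面的命令粗略检查 PyTorch 是否安装成功、CUDA 是否可用(仅作验证示意):
+
+```shell
+python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+```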
+
+## 最佳实践
+
+我们建议用户遵循我们的最佳实践来安装 MMAction2。然而,整个过程是高度可定制的。更多信息请参见[自定义安装](#customize-installation)部分。
+
+**第一步。** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMEngine](https://github.com/open-mmlab/mmengine)、[MMCV](https://github.com/open-mmlab/mmcv)、[MMDetection](https://github.com/open-mmlab/mmdetection)(可选)和 [MMPose](https://github.com/open-mmlab/mmpose)(可选)。
+
+```shell
+pip install -U openmim
+mim install mmengine
+mim install mmcv
+mim install mmdet
+mim install mmpose
+```
+
+**第二步。** 安装 MMAction2。
+
+根据您的需求,我们支持两种安装模式:
+
+- [从源代码构建 MMAction2(推荐)](#build-mmaction2-from-source):您想在 MMAction2 框架上开发自己的动作识别任务或新功能。例如,添加新的数据集或新的模型。因此,您可以使用我们提供的所有工具。
+- [安装为 Python 包](#install-as-a-python-package):您只想在项目中调用 MMAction2 的 API 或导入 MMAction2 的模块。
+
+### 从源代码构建 MMAction2
+
+在这种情况下,从源代码安装 mmaction2:
+
+```shell
+git clone https://github.com/open-mmlab/mmaction2.git
+cd mmaction2
+pip install -v -e .
+# "-v" 表示输出更多安装相关的信息
+# "-e" 表示以可编辑形式安装,这样可以在不重新安装的情况下,让本地修改直接生效。
+```
+
+可选地,如果您希望为 MMAction2 做出贡献或体验实验功能,请切换到 `dev-1.x` 分支:
+
+```shell
+git checkout dev-1.x
+```
+
+### 安装为 Python 包
+
+只需使用 pip 安装即可。
+
+```shell
+pip install mmaction2
+```
+
+## 验证安装
+
+为了验证 MMAction2 是否安装正确,我们提供了一些示例代码来运行推理演示。
+
+**第一步。** 下载配置文件和权重文件。
+
+```shell
+mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .
+```
+
+**第二步。** 验证推理演示。
+
+选项(a)。如果您是从源代码安装的 mmaction2,可以运行以下命令:
+
+```shell
+# demo.mp4 和 label_map_k400.txt 都来自于 Kinetics-400
+python demo/demo.py tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \
+ tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth \
+ demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
+```
+
+您将在终端看到前5个标签及其对应的分数。
+
+选项(b)。如果您将 mmaction2 安装为一个 Python 包,可以在 Python 解释器中运行以下代码,这将进行类似的验证:
+
+```python
+from operator import itemgetter
+from mmaction.apis import init_recognizer, inference_recognizer
+
+config_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py'
+checkpoint_file = 'tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth'
+video_file = 'demo/demo.mp4'
+label_file = 'tools/data/kinetics/label_map_k400.txt'
+model = init_recognizer(config_file, checkpoint_file, device='cpu') # or device='cuda:0'
+pred_result = inference_recognizer(model, video_file)
+
+pred_scores = pred_result.pred_scores.item.tolist()
+score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))
+score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)
+top5_label = score_sorted[:5]
+
+labels = open(label_file).readlines()
+labels = [x.strip() for x in labels]
+results = [(labels[k[0]], k[1]) for k in top5_label]
+
+print('The top-5 labels with corresponding scores are:')
+for result in results:
+ print(f'{result[0]}: ', result[1])
+```
+
+## 自定义安装
+
+### CUDA 版本
+
+在安装 PyTorch 时,您可能需要指定 CUDA 的版本。如果您不确定选择哪个版本,请遵循我们的建议:
+
+- 对于 Ampere 架构的 NVIDIA GPU,例如 GeForce 30 series 以及 NVIDIA A100,CUDA 11 是必需的。
+- 对于更早的 NVIDIA GPU,CUDA 11 是向后兼容的,但 CUDA 10.2 能够提供更好的兼容性,也更加轻量。
+
+请确保 GPU 驱动程序满足最低版本要求。有关更多信息,请参见[此表格](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions)。
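+
+可以先运行 `nvidia-smi` 查看当前的驱动版本及其支持的 CUDA 版本,再与上述表格中的最低要求进行对照:
+
+```shell
+nvidia-smi
+```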
+
+```{note}
+如果按照我们的最佳实践进行安装,仅安装 CUDA 运行时库就足够了,因为不会在本地编译任何 CUDA 代码。然而,如果您希望从源代码编译 MMCV 或开发其他 CUDA 运算符,您需要从 NVIDIA 的[网站](https://developer.nvidia.com/cuda-downloads)安装完整的 CUDA 工具包,并且其版本应与 PyTorch 的 CUDA 版本匹配,即 `conda install` 命令中指定的 cudatoolkit 的版本。
+```
+
+### 不使用 MIM 安装 MMCV
+
+MMCV 包含 C++ 和 CUDA 扩展,因此它与 PyTorch 的关系比较复杂。MIM 可以自动解决这些依赖关系,使安装变得更加容易。但这不是必须的。
+
+如果您希望使用 pip 而不是 MIM 安装 MMCV,请参考[MMCV 安装指南](https://mmcv.readthedocs.io/en/latest/get_started/installation.html)。这需要手动指定基于 PyTorch 版本和其 CUDA 版本的 find-url。
+
+例如,以下命令安装了为 PyTorch 1.10.x 和 CUDA 11.3 构建的 mmcv。
+
+```shell
+pip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html
+```
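+
+如果不确定本机 PyTorch 与 CUDA 的版本组合,可以先用以下命令查看,再据此选择对应的 find-url(仅作示意):
+
+```shell
+python -c "import torch; print(torch.__version__, torch.version.cuda)"
+```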
+
+### 在 CPU 环境中安装
+
+MMAction2 可以仅在 CPU 环境中安装。在 CPU 模式下,你可以完成训练、测试和模型推理等所有操作。
+
+在 CPU 模式下,MMCV 的部分功能将不可用,通常是一些 GPU 编译的算子。不过不用担心, MMAction2 中几乎所有的模型都不会依赖这些算子。
+
+### 通过 Docker 使用 MMAction2
+
+我们提供了一个[Dockerfile](https://github.com/open-mmlab/mmaction2/blob/main/docker/Dockerfile)来构建镜像。确保您的[docker 版本](https://docs.docker.com/engine/install/) >=19.03。
+
+```shell
+# 基于 Dockerfile 中指定的 PyTorch、CUDA 和 CUDNN 版本构建镜像。
+# 如果您需要其他版本,请修改 Dockerfile。
+docker build -f ./docker/Dockerfile --rm -t mmaction2 .
+```
+
+使用以下命令运行它:
+
+```shell
+# 将 {DATA_DIR} 替换为你本地存放数据集的目录
+docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2
+```
+
+## 故障排除
+
+1. 当从旧版本 `0.x` 迁移到新版本 `1.x` 时,您可能会遇到依赖库版本不匹配的问题。下面是在按照上述安装过程执行后,通过 `pip list` 命令显示的每个依赖库的版本。请确保在终端中显示的每个依赖库版本都大于或等于(即 `>=`)下面每个依赖库的版本。
+
+```shell
+mmaction2 1.0.0
+mmcv 2.0.0
+mmdet 3.0.0
+mmengine 0.7.2
+mmpose 1.0.0
+```
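+
+可以用类似下面的命令快速筛选并查看这些依赖库的版本(仅作示意,Windows 下可直接运行 `pip list` 后手动查找):
+
+```shell
+pip list | grep -E "mmaction2|mmcv|mmdet|mmengine|mmpose"
+```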
diff --git a/docs/zh_cn/get_started/overview.md b/docs/zh_cn/get_started/overview.md
new file mode 100644
index 0000000000..759b2290f2
--- /dev/null
+++ b/docs/zh_cn/get_started/overview.md
@@ -0,0 +1,97 @@
+# 概述
+
+## 什么是 MMAction2
+
+MMAction2 是一个基于 PyTorch 的开源工具包,支持了大量的视频理解模型,包括**行为识别、基于骨架的行为识别、时空行为检测和时序动作定位**等多个主要方向。它还支持了大多数流行的学术数据集,并提供了许多实用工具帮助用户对数据集和模型进行多方面的探索和调试。它具有以下特点:
+
+**全流程,多模型**:MMAction2 支持各种视频理解任务,实现了最先进的行为识别、定位、检测模型。
+
+**模块化设计**:MMAction2 的模块化设计使用户可以根据需要定义和重用模型中的模块。
+
+**实用工具众多**:MMAction2 提供了一系列的分析工具,如可视化器、验证脚本、评估器等,以帮助用户进行故障排除、微调或比较模型。
+
+**由 OpenMMLab 强力驱动**:与家族内的其它算法库一样,MMAction2 遵循着 OpenMMLab 严谨的开发准则和接口约定,极大地降低了用户切换各算法库时的学习成本。同时,MMAction2 也可以非常便捷地与家族内其他算法库跨库联动,从而满足用户跨领域研究和落地的需求。
+
+
+(此处为各类任务的演示示例:行为识别、基于骨架的行为识别、时空动作检测)
+
+## 如何使用文档
+
+针对不同类型的用户,我们准备了详细的指南:
+
+
+**MMAction2 的基础用法**
+
+- [安装](installation.md)
+- [快速运行](quick_run.md)
+- [利用现有模型进行推理](../user_guides/inference.md)
+
+
+
+
+**关于在已支持的数据集上进行训练**
+
+- [了解配置文件](../user_guides/config.md)
+- [准备数据集](../user_guides/prepare_dataset.md)
+- [训练与测试](../user_guides/train_test.md)
+
+
+
+
+**关于使用过程中的常见问题**
+
+- [常见问题解答](faq.md)
+- [有用的工具](../useful_tools.md)
+
+
+
+
+**关于 MMAction2 的框架设计**
+
+- [20分钟 MMAction2 框架指南](guide_to_framework.md)
+- [MMAction2 中的数据流](../advanced_guides/dataflow.md)
+
+
+
+
+**关于自定义训练的高级用法**
+
+- [自定义模型](../advanced_guides/customize_models.md)
+- [自定义数据集](../advanced_guides/customize_dataset.md)
+- [自定义数据管道](../advanced_guides/customize_pipeline.md)
+- [自定义优化器](../advanced_guides/customize_optimizer.md)
+- [自定义日志记录](../advanced_guides/customize_logging.md)
+
+
+
+
+**关于支持的模型库和数据集**
+
+- [模型库](../modelzoo_statistics.md)
+- [数据集](../datasetzoo_statistics.md)
+
+
+
+
+**关于从 MMAction2 0.x 迁移**
+
+- [从 MMAction2 0.x 迁移](../migration.md)
+
+
+
+
+**对于希望加入开源社区,向 MMAction2 贡献代码的研究者和开发者**
+
+- [如何为 MMAction2 做出贡献](contribution_guide.md)
+
+
diff --git a/docs/zh_cn/get_started/quick_run.md b/docs/zh_cn/get_started/quick_run.md
new file mode 100644
index 0000000000..00e984c4b2
--- /dev/null
+++ b/docs/zh_cn/get_started/quick_run.md
@@ -0,0 +1,219 @@
+# 快速运行
+
+本章将介绍 MMAction2 的基本功能。我们假设你已经[源码安装 MMAction2](installation.md#best-practices)。
+
+- [快速运行](#快速运行)
+ - [推理](#推理)
+ - [准备数据集](#准备数据集)
+ - [修改配置](#修改配置)
+ - [修改数据集](#修改数据集)
+ - [修改运行配置](#修改运行配置)
+ - [修改模型配置](#修改模型配置)
+ - [浏览数据集](#浏览数据集)
+ - [训练](#训练)
+ - [测试](#测试)
+
+## 推理
+
+在 MMAction2 的根目录下执行如下命令:
+
+```shell
+python demo/demo_inferencer.py demo/demo.mp4 \
+ --rec tsn --print-result \
+ --label-file tools/data/kinetics/label_map_k400.txt
+```
+
+您应该能够看到弹出的视频窗口,以及控制台中打印的推理结果。
+
+```bash
+# 推理结果
+{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]}
+```
+
+```{note}
+如果您在没有 GUI 的服务器上运行 MMAction2,或者通过禁用 X11 转发的 SSH 隧道运行 MMAction2,则可能不会看到弹出窗口。
+```
+
+关于 MMAction2 推理接口的详细描述可以在[这里](/demo/README.md#inferencer)找到。
+
+除了使用我们提供的预训练模型,您还可以在自己的数据集上训练模型。在下一节中,我们将通过在精简版 [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) 数据集上训练 TSN 为例,带您了解 MMAction2 的基本功能。
+
+## 准备数据集
+
+由于视频数据集格式的多样性不利于数据集的切换,MMAction2 提出了统一的[数据格式](../user_guides/prepare_dataset.md) ,并为常用的视频数据集提供了[数据集准备指南](../user_guides/data_prepare/dataset_prepare.md)。通常,要在 MMAction2 中使用这些数据集,你只需要按照步骤进行准备。
+
+```{note}
+但在这里,效率意味着一切。
+```
+
+首先,请下载我们预先准备好的 [kinetics400_tiny.zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) ,并将其解压到 MMAction2 根目录下的 `data/` 目录。这将为您提供必要的视频和注释文件。
+
+```Bash
+wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip
+mkdir -p data/
+unzip kinetics400_tiny.zip -d data/
+```
+
+## 修改配置
+
+准备好数据集之后,下一步是修改配置文件,以指定训练集和训练参数的位置。
+
+在本例中,我们将使用 resnet50 作为主干网络来训练 TSN。由于 MMAction2 已经有了完整的 Kinetics400 数据集的配置文件 (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`),我们只需要在其基础上进行一些修改。
+
+### 修改数据集
+
+我们首先需要修改数据集的路径。打开 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` ,按如下替换关键字:
+
+```Python
+data_root = 'data/kinetics400_tiny/train'
+data_root_val = 'data/kinetics400_tiny/val'
+ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt'
+ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt'
+```
+
+### 修改运行配置
+
+此外,由于数据集规模较小,我们建议将训练批大小减少到 4,训练 epoch 数相应减少到 10。同时,我们建议将验证和保存权重的间隔缩短为 1 个 epoch,并修改学习率衰减策略。在 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 中修改对应的关键字即可生效,如下所示。
+
+```python
+# 设置训练批大小为 4
+train_dataloader['batch_size'] = 4
+
+# 每轮都保存权重,并且只保留最新的权重
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1))
+# 将最大 epoch 数设置为 10,并每 1 个 epoch 验证一次模型
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1)
+# 根据 10 个 epoch 调整学习率调度
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=10,
+ by_epoch=True,
+ milestones=[4, 8],
+ gamma=0.1)
+]
+```
+
+### 修改模型配置
+
+此外,由于精简版 Kinetics 数据集规模较小,建议加载原始 Kinetics 数据集上的预训练模型。此外,模型需要根据实际类别数进行修改。请直接将以下代码添加到 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 中。
+
+```python
+model = dict(
+ cls_head=dict(num_classes=2))
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth'
+```
+
+在这里,我们直接借助配置文件的继承机制(参见 [MMEngine: 配置](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html))重写了基础配置中的相应参数。原始字段分布在 `configs/_base_/models/tsn_r50.py`、`configs/_base_/schedules/sgd_100e.py` 和 `configs/_base_/default_runtime.py` 中。
+
+```{note}
+关于配置的更详细的描述,请参考[这里](../user_guides/config.md)。
+```
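+
+修改完成后,可以用 MMEngine 的 `Config` 加载该配置文件,快速确认关键字段是否按预期生效(以下字段名以实际配置内容为准,仅作示意):
+
+```python
+from mmengine.config import Config
+
+cfg = Config.fromfile(
+    'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
+# 打印几个刚刚修改过的字段
+print(cfg.train_dataloader.batch_size)
+print(cfg.train_cfg.max_epochs)
+print(cfg.model.cls_head.num_classes)
+```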
+
+## 浏览数据集
+
+在开始训练之前,我们还可以将训练时数据转换处理的帧可视化。这很简单:将需要可视化的配置文件传给 [browse_dataset.py](/tools/visualizations/browse_dataset.py) 脚本即可。
+
+```Bash
+python tools/visualizations/browse_dataset.py \
+ configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+ browse_out --mode pipeline
+```
+
+转换后的视频将被保存到 `browse_out` 文件夹中。
+
+```{note}
+有关该脚本的参数和使用方法的详细信息,请参考[这里](../useful_tools.md)。
+```
+
+```{tip}
+除了满足我们的好奇心,可视化还可以帮助我们在训练前检查可能影响模型性能的部分,例如配置、数据集和数据转换中的问题。
+```
+
+我们可以通过以下脚本进一步可视化学习率调度,以确保配置符合预期:
+
+```Bash
+python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+```
+
+学习率调度曲线将显示在弹出的窗口中。
+
+```{note}
+学习率根据实际批数据大小自动缩放。
+```
+
+## 训练
+
+运行如下命令启动训练:
+
+```Bash
+python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+```
+
+根据系统环境,MMAction2 将自动使用最佳的设备进行训练。如果有 GPU,则默认启动单 GPU 训练。当你开始看到 loss 的输出时,就说明训练已经成功启动。
+
+```Bash
+03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267
+03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130
+03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900
+03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs
+03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 1.2716 time: 1.3658
+03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth.
+```
+
+在没有额外配置的情况下,模型权重将被保存到 `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`,而日志将被存储到 `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`。接下来,我们只需要耐心等待训练完成。
+
+```{note}
+训练的高级用法,如 CPU 训练、多卡训练及集群训练,请参考[训练与测试](../user_guides/train_test.md)。
+```
+
+## 测试
+
+经过 10 个 epoch 后,我们观察到 TSN 在第 6 个 epoch 表现最好,`acc/top1` 达到 1.0000:
+
+```Bash
+03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000data_time: 1.0210 time: 1.1091
+```
+
+```{note}
+由于模型在完整的 Kinetics400 数据集上进行过预训练,这里的结果非常高;您实际运行时可能会看到不同的结果。
+```
+
+然而,该值仅反映了 TSN 在精简版 Kinetics 数据集上的验证性能,而测试结果通常更高,因为在测试数据流水线中增加了更多的数据增强。
+
+开始测试:
+
+```Bash
+python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+ work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth
+```
+
+并得到如下输出:
+
+```Bash
+03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795
+```
+
+该模型在该数据集上实现了 1.000 的 top1 准确率。
+
+```{note}
+测试的高级用法,如 CPU 测试、多卡测试及集群测试,请参考[训练与测试](../user_guides/train_test.md)。
+```
diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
index 2b69d6d2af..0571bb34d6 100644
--- a/docs/zh_cn/index.rst
+++ b/docs/zh_cn/index.rst
@@ -1,57 +1,89 @@
-Welcome to MMAction2's documentation!
+欢迎来到 MMAction2 中文教程!
=====================================
You can switch between Chinese and English documents in the lower-left corner of the layout.
.. toctree::
:maxdepth: 1
- :caption: Get Started
+ :caption: 新手入门
- get_started.md
+ get_started/overview.md
+ get_started/installation.md
+ get_started/quick_run.md
+ get_started/guide_to_framework.md
+ get_started/contribution_guide.md
+ get_started/faq.md
.. toctree::
:maxdepth: 1
- :caption: User Guides
+ :caption: 用户指南
- user_guides/1_config.md
- user_guides/2_data_prepare.md
- user_guides/3_inference.md
+ user_guides/inference.md
+ user_guides/config.md
user_guides/train_test.md
+ user_guides/prepare_dataset.md
+ user_guides/finetune.md
.. toctree::
:maxdepth: 1
- :caption: Useful Tools
+ :caption: 进阶教程
- user_guides/useful_tools.md
- user_guides/visualization.md
+ advanced_guides/dataflow.md
+ advanced_guides/customize_models.md
+ advanced_guides/customize_dataset.md
+ advanced_guides/customize_pipeline.md
+ advanced_guides/customize_optimizer.md
+ advanced_guides/customize_logging.md
+ advanced_guides/deploy.md
+ useful_tools.md
+
+
+.. toctree::
+ :maxdepth: 1
+ :caption: 模型支持
+
+ modelzoo_statistics.md
+ model_zoo/recognition.md
+ model_zoo/recognition_audio.md
+ model_zoo/skeleton.md
+ model_zoo/detection.md
+ model_zoo/retrieval.md
+ model_zoo/localization.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: 数据集支持
+ :glob:
+
+ datasetzoo_statistics.md
+ dataset_zoo/*
+
+.. toctree::
+ :maxdepth: 1
+ :caption: 相关项目
+
+ projectzoo.md
.. toctree::
:maxdepth: 1
- :caption: Migration
+ :caption: MMAction2 0.x 迁移指南
migration.md
.. toctree::
:maxdepth: 1
- :caption: Model Zoo
+ :caption: API 参考文档
- modelzoo.md
- recognition_models.md
- detection_models.md
- skeleton_models.md
- localization_models.md
+ api.rst
.. toctree::
:maxdepth: 1
- :caption: Notes
+ :caption: 其他说明
- notes/contribution_guide.md
- notes/projects.md
- notes/changelog.md
- notes/faq.md
+ notes/ecosystem.md
.. toctree::
- :caption: Switch Language
+ :caption: 切换语言
switch_language.md
diff --git a/docs/zh_cn/merge_docs.sh b/docs/zh_cn/merge_docs.sh
deleted file mode 100644
index aa2a9bebfd..0000000000
--- a/docs/zh_cn/merge_docs.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env bash
-
-## gather models
-cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > localization_models.md
-cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > recognition_models.md
-cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> recognition_models.md
-cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > detection_models.md
-cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > skeleton_models.md
diff --git a/docs/zh_cn/migration.md b/docs/zh_cn/migration.md
index 90ab61e459..e13e2ae1e5 100644
--- a/docs/zh_cn/migration.md
+++ b/docs/zh_cn/migration.md
@@ -1 +1,488 @@
-# 从 MMAction2 0.x 迁移 (内容建设中)
+# 从 MMAction2 0.x 迁移
+
+MMAction2 1.x 引入了一些重构和修改,包括一些向后不兼容的更改。我们提供这个教程,帮助您从 MMAction2 0.x 迁移您的项目。
+
+## 新的依赖项
+
+MMAction2 1.x 依赖于以下库。建议您准备一个新的运行环境,并根据[安装教程](./get_started/installation.md)进行安装。
+
+1. [MMEngine](https://github.com/open-mmlab/mmengine):MMEngine 是 OpenMMLab 2.0 架构中引入的、用于训练深度学习模型的基础库。
+2. [MMCV](https://github.com/open-mmlab/mmcv):MMCV 是用于计算机视觉的基础库。MMAction2 1.x 需要 `mmcv>=2.0.0`,它比 `mmcv-full==1.x` 更紧凑和高效。
+
+## 配置文件
+
+在 MMAction2 1.x 中,我们重构了配置文件的结构。旧风格的配置文件将不兼容。
+
+在本节中,我们将介绍配置文件的所有更改。我们假设您已经熟悉[配置文件](./user_guides/config.md)。
+
+### 模型设置
+
+`model.backbone` 和 `model.neck` 没有更改。对于 `model.cls_head`,我们将 `average_clips` 移到其中,原本设置在 `model.test_cfg` 中。
+
+### 数据设置
+
+#### **`data`** 中的更改
+
+- 原始的 `data` 字段被拆分为 `train_dataloader`、`val_dataloader` 和 `test_dataloader`。这样可以对它们进行细粒度的配置。例如,您可以在训练和测试过程中指定不同的采样器和批大小。
+- `videos_per_gpu` 改名为 `batch_size`。
+- `workers_per_gpu` 改名为 `num_workers`。
+
+
+
+**旧版本**
+
+
+```python
+data = dict(
+ videos_per_gpu=32,
+ workers_per_gpu=2,
+ train=dict(...),
+ val=dict(...),
+ test=dict(...),
+)
+```
+
+**新版本**
+
+
+```python
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ dataset=dict(...),
+ sampler=dict(type='DefaultSampler', shuffle=True) # 必要
+)
+
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ dataset=dict(...),
+ sampler=dict(type='DefaultSampler', shuffle=False) # 必要
+)
+
+test_dataloader = val_dataloader
+```
+
+
+
+
+#### **`pipeline`** 中的更改
+
+- 原来的格式化变换 **`ToTensor`**、**`Collect`** 被合并为 `PackActionInputs`。
+- 我们不建议在数据集流水线中进行 **`Normalize`**。请从流水线中移除它,并在 `model.data_preprocessor` 字段中设置。
+
+
+
+**旧版本**
+
+
+```python
+
+train_pipeline = [
+ dict(type='DecordInit'),
+ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='MultiScaleCrop',
+ input_size=224,
+ scales=(1, 0.875, 0.75, 0.66),
+ random_crop=False,
+ max_wh_scale_gap=1),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+ dict(type='ToTensor', keys=['imgs', 'label'])
+]
+```
+
+**新版本**
+
+
+```python
+model.data_preprocessor = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False)
+
+train_pipeline = [
+ dict(type='DecordInit'),
+ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='MultiScaleCrop',
+ input_size=224,
+ scales=(1, 0.875, 0.75, 0.66),
+ random_crop=False,
+ max_wh_scale_gap=1),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+```
+
+
+
+
+#### **`evaluation`** 中的更改
+
+- **`evaluation`** 字段被拆分为 `val_evaluator` 和 `test_evaluator`。不再支持 `interval` 和 `save_best` 参数。
+- `interval` 移到 `train_cfg.val_interval`,`save_best` 移到 `default_hooks.checkpoint.save_best`。
+- 'mean_average_precision'、'mean_class_accuracy'、'mmit_mean_average_precision'、'top_k_accuracy' 被合并为 `AccMetric`,您可以使用 `metric_list` 指定要计算的指标。
+- `AVAMetric` 用于评估 AVA 数据集。
+- `ANetMetric` 用于评估 ActivityNet 数据集。
+
+
+
+**旧版本**
+
+
+```python
+evaluation = dict(
+ interval=5,
+ metrics=['top_k_accuracy', 'mean_class_accuracy'])
+```
+
+**新版本**
+
+
+```python
+val_evaluator = dict(
+ type='AccMetric',
+ metric_list=('top_k_accuracy', 'mean_class_accuracy'))
+test_evaluator = val_evaluator
+```
+
+
+
+
+### 学习率策略设置
+
+#### **`optimizer`** 和 **`optimizer_config`** 中的更改
+
+- 现在我们使用 `optim_wrapper` 字段来配置优化过程。`optimizer` 成为 `optim_wrapper` 的子字段。
+- `paramwise_cfg` 也是 `optim_wrapper` 的子字段,与 `optimizer` 平行。
+- 现在已删除 `optimizer_config`,其中的所有配置都移动到 `optim_wrapper`。
+- `grad_clip` 改名为 `clip_grad`。
+
+
+
+**旧版本**
+
+
+```python
+optimizer = dict(
+ type='AdamW',
+ lr=0.0015,
+ weight_decay=0.3,
+ paramwise_cfg = dict(
+ norm_decay_mult=0.0,
+ bias_decay_mult=0.0,
+ ))
+
+optimizer_config = dict(grad_clip=dict(max_norm=1.0))
+```
+
+**新版本**
+
+
+```python
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=0.0015, weight_decay=0.3),
+ paramwise_cfg = dict(
+ norm_decay_mult=0.0,
+ bias_decay_mult=0.0,
+ ),
+ clip_grad=dict(max_norm=1.0),
+)
+```
+
+
+
+
+#### **`lr_config`** 中的更改
+
+- 删除了 `lr_config` 字段,我们使用新的 `param_scheduler` 来替代它。
+- 删除了与 warmup 相关的参数,因为我们使用策略组合来实现这个功能。
+
+新的组合机制非常灵活,您可以使用它来设计多种学习率/动量曲线。
+
+
+
+**旧版本**
+
+
+```python
+lr_config = dict(
+ policy='CosineAnnealing',
+ min_lr=0,
+ warmup='linear',
+ warmup_iters=5,
+ warmup_ratio=0.01,
+ warmup_by_epoch=True)
+```
+
+**新版本**
+
+
+```python
+param_scheduler = [
+ # 学习率预热
+ dict(
+ type='LinearLR',
+ start_factor=0.01,
+ by_epoch=True,
+ end=5,
+ # 在每个迭代后更新学习率。
+ convert_to_iter_based=True),
+ # 主要的学习率策略
+ dict(type='CosineAnnealingLR', by_epoch=True, begin=5),
+]
+```
+
+
+
+
+#### **`runner`** 中的更改
+
+原始 `runner` 字段中的大多数配置已移至 `train_cfg`、`val_cfg` 和 `test_cfg`,用于配置训练、验证和测试的循环。
+
+
+
+**旧版本**
+
+
+```python
+runner = dict(type='EpochBasedRunner', max_epochs=100)
+```
+
+**新版本**
+
+
+```python
+# `val_interval` 是原 `evaluation.interval`。
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop') # 使用默认验证循环。
+test_cfg = dict(type='TestLoop') # 使用默认测试循环。
+```
+
+
+
+
+事实上,在 OpenMMLab 2.0 中,我们引入了 `Loop` 来控制训练、验证和测试的行为。`Runner` 的功能也发生了变化。您可以在[MMEngine 教程](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html)中找到更多详细信息。
+
+### 运行时设置
+
+#### **`checkpoint_config`** 和 **`log_config`** 中的更改
+
+`checkpoint_config` 移到 `default_hooks.checkpoint`,`log_config` 移到 `default_hooks.logger`。我们将许多钩子的设置从脚本代码中移动到运行时配置的 `default_hooks` 字段中。
+
+```python
+default_hooks = dict(
+ # 更新运行时信息,如当前迭代和学习率。
+ runtime_info=dict(type='RuntimeInfoHook'),
+
+ # 记录每个迭代的时间。
+ timer=dict(type='IterTimerHook'),
+
+ # 每 100 次迭代打印日志。
+ logger=dict(type='LoggerHook', interval=100),
+
+ # 启用参数调度器。
+ param_scheduler=dict(type='ParamSchedulerHook'),
+
+ # 每个 epoch 保存一次权重,并自动保存最佳权重。
+ checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'),
+
+ # 在分布式环境中设置采样器种子。
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+
+ # 在每个 epoch 结束时同步模型缓冲区。
+ sync_buffers=dict(type='SyncBuffersHook')
+)
+```
+
+此外,我们将原来的 logger 拆分为 logger 和 visualizer。logger 用于记录信息,visualizer 用于在不同的后端(如终端、TensorBoard 和 Wandb)中显示 logger。
+
+
+
+**旧版本**
+
+
+```python
+log_config = dict(
+ interval=100,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook'),
+ ])
+```
+
+**新版本**
+
+
+```python
+default_hooks = dict(
+ ...
+ logger=dict(type='LoggerHook', interval=100),
+)
+
+visualizer = dict(
+ type='ActionVisualizer',
+ vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')],
+)
+```
+
+
+
+
+#### **`load_from`** 和 **`resume_from`** 中的更改
+
+- 删除了 `resume_from`。现在我们使用 `resume` 和 `load_from` 来替代它。
+ - 如果 `resume=True` 并且 `load_from` 不为 None,则从 `load_from` 中的权重恢复训练。
+ - 如果 `resume=True` 并且 `load_from` 为 None,则尝试从工作目录中的最新权重恢复。
+ - 如果 `resume=False` 并且 `load_from` 不为 None,则只加载权重文件,不恢复训练。
+ - 如果 `resume=False` 并且 `load_from` 为 None,则既不加载也不恢复。
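+
+例如,下面的配置片段表示从某个已有的权重文件恢复训练(其中的权重路径仅为假设的示例):
+
+```python
+# 从指定权重恢复训练:既加载权重,也恢复优化器状态、epoch 等训练信息
+load_from = 'work_dirs/my_exp/epoch_10.pth'  # 假设的权重路径
+resume = True
+```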
+
+#### **`dist_params`** 中的更改
+
+`dist_params` 字段现在是 `env_cfg` 的子字段。`env_cfg` 中还有一些新的配置。
+
+```python
+env_cfg = dict(
+ # 是否启用 cudnn benchmark
+ cudnn_benchmark=False,
+
+ # 设置多进程参数
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+
+ # 设置分布式参数
+ dist_cfg=dict(backend='nccl'),
+)
+```
+
+#### **`workflow`** 中的更改
+
+删除了与 `workflow` 相关的功能。
+
+#### 新字段 **`visualizer`**
+
+visualizer 是 OpenMMLab 2.0 架构中的新设计。我们在 runner 中使用一个 visualizer 实例来处理结果和日志的可视化,并保存到不同的后端,如终端、TensorBoard 和 Wandb。
+
+```python
+visualizer = dict(
+ type='ActionVisualizer',
+ vis_backends=[
+ dict(type='LocalVisBackend'),
+ # 取消下面一行的注释,将日志和可视化结果保存到 TensorBoard。
+ # dict(type='TensorboardVisBackend')
+ ]
+)
+```
+
+#### 新字段 **`default_scope`**
+
+`default_scope` 指定了所有注册器搜索模块时的默认作用域,在 MMAction2 中为 `mmaction`;同时,各个注册器的定义也已统一移动到 `mmaction.registry` 包中。
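+
+在配置文件中通常只需一行即可指定:
+
+```python
+default_scope = 'mmaction'
+```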
+
+## Packages
+
+### `mmaction.apis`
+
+文档可以在[这里](mmaction.apis)找到。
+
+| 函数 | 更改 |
+| :--------------------: | :------------------------------------------: |
+| `init_recognizer` | 无需更改 |
+| `inference_recognizer` | 无需更改 |
+| `train_model` | 删除,使用 `runner.train` 进行训练 |
+| `multi_gpu_test` | 删除,使用 `runner.test` 进行测试 |
+| `single_gpu_test` | 删除,使用 `runner.test` 进行测试 |
+| `set_random_seed` | 删除,使用 `mmengine.runner.set_random_seed` |
+| `init_random_seed` | 删除,使用 `mmengine.dist.sync_random_seed` |
+
+### `mmaction.core`
+
+`mmaction.core` 包已被重命名为 [`mmaction.engine`](mmaction.engine)。
+
+| 子包 | 更改 |
+| :----------: | :-------------------------------------------------------: |
+| `evaluation` | 删除,使用 `mmaction.evaluation` 中的指标 |
+| `hooks` | 移动到 `mmaction.engine.hooks` |
+| `optimizer` | 移动到 `mmaction.engine.optimizers` |
+| `utils` | 删除,分布式环境相关的函数可以在 `mmengine.dist` 包中找到 |
+
+### `mmaction.datasets`
+
+文档可以在[这里](mmaction.datasets)找到。
+
+#### [`BaseActionDataset`](mmaction.datasets.BaseActionDataset) 中的更改:
+
+| 方法 | 更改 |
+| :--------------------: | :-----------------------------------------: |
+| `prepare_train_frames` | 由 `get_data_info` 替换 |
+| `prepare_test_frames` | 由 `get_data_info` 替换 |
+| `evaluate` | 删除,使用 `mmengine.evaluator.Evaluator` |
+| `dump_results` | 删除,使用 `mmengine.evaluator.DumpResults` |
+| `load_annotations` | 替换为 `load_data_list` |
+
+现在,您可以编写一个继承自 `BaseActionDataset` 的新 Dataset 类,并仅重写 `load_data_list`。要加载更多的数据信息,您可以像 `RawframeDataset` 和 `AVADataset` 那样重写 `get_data_info`。
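+
+下面是一个只重写 `load_data_list` 的最小示意(假设标注文件每行为"视频路径 标签"两列):
+
+```python
+from mmaction.datasets import BaseActionDataset
+from mmaction.registry import DATASETS
+
+
+@DATASETS.register_module()
+class MyVideoDataset(BaseActionDataset):
+    """一个假设的自定义视频数据集,仅用于演示 load_data_list 的写法。"""
+
+    def load_data_list(self):
+        data_list = []
+        with open(self.ann_file) as f:
+            for line in f:
+                filename, label = line.strip().split()
+                # 每条样本是一个 dict,字段名需与后续 pipeline 的约定保持一致
+                data_list.append(dict(filename=filename, label=int(label)))
+        return data_list
+```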
+
+`mmaction.datasets.pipelines` 被重命名为 `mmaction.datasets.transforms`,`mmaction.datasets.pipelines.augmentations` 被重命名为 `mmaction.datasets.transforms.processing`。
+
+### `mmaction.models`
+
+文档可以在[这里](mmaction.models)找到。所有 **backbones**、**necks** 和 **losses** 的接口没有更改。
+
+[`BaseRecognizer`](mmaction.models.BaseRecognizer) 中的更改:
+
+| 方法 | 更改 |
+| :-------------: | :----------------------------------------------------------------------------------------------------------------------------: |
+| `extract_feat` | 增强的方法,现在支持三个阶段(`backbone`、`neck`、`head`)的输出特征,并且可以处理不同的模式,如 `train_mode` 和 `test_mode`。 |
+| `forward` | 现在只接受三个参数:`inputs`、`data_samples` 和 `mode`。详细信息请参阅[文档](mmaction.models.BaseRecognizer)。 |
+| `forward_train` | 已替换为 `loss`。 |
+| `forward_test` | 已替换为 `predict`。 |
+| `train_step` | `optimizer` 参数被替换为 `optim_wrapper`,它接受 [`OptimWrapper`](mmengine.optim.OptimWrapper)。 |
+| `val_step` | 原 `val_step` 与 `train_step` 相同,现在调用 `predict`。 |
+| `test_step` | 新方法,与 `val_step` 相同。 |
+
+[BaseHead](mmaction.models.BaseHead) 中的更改:
+
+| 方法 | 更改 |
+| :-------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| `forward` | 无需更改 |
+| `loss` | 接受 `feats` 和 `data_samples`,而不是 `cls_score` 和 `labels` 来计算损失。`data_samples` 是 [ActionDataSample](mmaction.structures.ActionDataSample) 的列表。 |
+| `predict` | 接受 `feats` 和 `data_samples` 来预测分类分数。 |
+
+### `mmaction.utils`
+
+| 函数 | 更改 |
+| :---------------------: | :--------------------------------------------------------: |
+| `collect_env` | 无需更改 |
+| `get_root_logger` | 删除,使用 `mmengine.MMLogger.get_current_instance` |
+| `setup_multi_processes` | 删除,使用 `mmengine.utils.dl_utils.setup_multi_processes` |
+
+### 其他更改
+
+- 我们将所有注册器的定义从各个包移动到了 `mmaction.registry` 。
diff --git a/docs/zh_cn/notes/changelog.md b/docs/zh_cn/notes/changelog.md
deleted file mode 100644
index f290b1e081..0000000000
--- a/docs/zh_cn/notes/changelog.md
+++ /dev/null
@@ -1,850 +0,0 @@
-# Changelog
-
-## 1.0.0rc1 (14/10/2022)
-
-**Highlights**
-
-- Support Video Swin Transformer
-
-**New Features**
-
-- Support Video Swin Transformer ([#1939](https://github.com/open-mmlab/mmaction2/pull/1939))
-
-**Improvements**
-
-- Add colab tutorial for 1.x ([#1956](https://github.com/open-mmlab/mmaction2/pull/1956))
-- Support skeleton-based action recognition demo ([#1920](https://github.com/open-mmlab/mmaction2/pull/1920))
-
-**Bug Fixes**
-
-- Fix link in doc ([#1986](https://github.com/open-mmlab/mmaction2/pull/1986), [#1967](https://github.com/open-mmlab/mmaction2/pull/1967), [#1951](https://github.com/open-mmlab/mmaction2/pull/1951), [#1926](https://github.com/open-mmlab/mmaction2/pull/1926),[#1944](https://github.com/open-mmlab/mmaction2/pull/1944), [#1944](https://github.com/open-mmlab/mmaction2/pull/1944), [#1927](https://github.com/open-mmlab/mmaction2/pull/1927), [#1925](https://github.com/open-mmlab/mmaction2/pull/1925))
-- Fix CI ([#1987](https://github.com/open-mmlab/mmaction2/pull/1987), [#1930](https://github.com/open-mmlab/mmaction2/pull/1930), [#1923](https://github.com/open-mmlab/mmaction2/pull/1923))
-- Fix pre-commit hook config ([#1971](https://github.com/open-mmlab/mmaction2/pull/1971))
-- Fix TIN config ([#1912](https://github.com/open-mmlab/mmaction2/pull/1912))
-- Fix UT for BMN and BSN ([#1966](https://github.com/open-mmlab/mmaction2/pull/1966))
-- Fix UT for Recognizer2D ([#1937](https://github.com/open-mmlab/mmaction2/pull/1937))
-- Fix BSN and BMN configs for localization ([#1913](https://github.com/open-mmlab/mmaction2/pull/1913))
-- Modeify ST-GCN configs ([#1913](https://github.com/open-mmlab/mmaction2/pull/1914))
-- Fix typo in migration doc ([#1931](https://github.com/open-mmlab/mmaction2/pull/1931))
-- Remove Onnx related tools ([#1928](https://github.com/open-mmlab/mmaction2/pull/1928))
-- Update TANet readme ([#1916](https://github.com/open-mmlab/mmaction2/pull/1916), [#1890](https://github.com/open-mmlab/mmaction2/pull/1890))
-- Update 2S-AGCN readme ([#1915](https://github.com/open-mmlab/mmaction2/pull/1915))
-- Fix TSN configs ([#1905](https://github.com/open-mmlab/mmaction2/pull/1905))
-- Fix configs for detection ([#1903](https://github.com/open-mmlab/mmaction2/pull/1903))
-- Fix typo in TIN config ([#1904](https://github.com/open-mmlab/mmaction2/pull/1904))
-- Fix PoseC3D readme ([#1899](https://github.com/open-mmlab/mmaction2/pull/1899))
-- Fix ST-GCN configs ([#1891](https://github.com/open-mmlab/mmaction2/pull/1891))
-- Fix audio recognition readme ([#1898](https://github.com/open-mmlab/mmaction2/pull/1898))
-- Fix TSM readme ([#1887](https://github.com/open-mmlab/mmaction2/pull/1887))
-- Fix SlowOnly readme ([#1889](https://github.com/open-mmlab/mmaction2/pull/1889))
-- Fix TRN readme ([#1888](https://github.com/open-mmlab/mmaction2/pull/1888))
-- Fix typo in get_started doc ([#1895](https://github.com/open-mmlab/mmaction2/pull/1895))
-
-## 1.0.0rc0 (09/01/2022)
-
-We are excited to announce the release of MMAction2 v1.0.0rc0.
-MMAction2 1.0.0beta is the first version of MMAction2 1.x, a part of the OpenMMLab 2.0 projects.
-Built upon the new [training engine](https://github.com/open-mmlab/mmengine).
-
-**Highlights**
-
-- **New engines**. MMAction2 1.x is based on MMEngine\](https://github.com/open-mmlab/mmengine), which provides a general and powerful runner that allows more flexible customizations and significantly simplifies the entrypoints of high-level interfaces.
-
-- **Unified interfaces**. As a part of the OpenMMLab 2.0 projects, MMAction2 1.x unifies and refactors the interfaces and internal logics of train, testing, datasets, models, evaluation, and visualization. All the OpenMMLab 2.0 projects share the same design in those interfaces and logics to allow the emergence of multi-task/modality algorithms.
-
-- **More documentation and tutorials**. We add a bunch of documentation and tutorials to help users get started more smoothly. Read it [here](https://github.com/open-mmlab/mmaction2/blob/main/docs/en/migration.md).
-
-**Breaking Changes**
-
-In this release, we made lots of major refactoring and modifications. Please refer to the [migration guide](../migration.md) for details and migration instructions.
-
-## 0.24.0 (05/05/2022)
-
-**Highlights**
-
-- Support different seeds
-
-**New Features**
-
-- Add lateral norm in multigrid config ([#1567](https://github.com/open-mmlab/mmaction2/pull/1567))
-- Add openpose 25 joints in graph config ([#1578](https://github.com/open-mmlab/mmaction2/pull/1578))
-- Support MLU Backend ([#1608](https://github.com/open-mmlab/mmaction2/pull/1608))
-
-**Bug and Typo Fixes**
-
-- Fix local_rank ([#1558](https://github.com/open-mmlab/mmaction2/pull/1558))
-- Fix install typo ([#1571](https://github.com/open-mmlab/mmaction2/pull/1571))
-- Fix the inference API doc ([#1580](https://github.com/open-mmlab/mmaction2/pull/1580))
-- Fix zh-CN demo.md and getting_started.md ([#1587](https://github.com/open-mmlab/mmaction2/pull/1587))
-- Remove Recommonmark ([#1595](https://github.com/open-mmlab/mmaction2/pull/1595))
-- Fix inference with ndarray ([#1603](https://github.com/open-mmlab/mmaction2/pull/1603))
-- Fix the log error when `IterBasedRunner` is used ([#1606](https://github.com/open-mmlab/mmaction2/pull/1606))
-
-## 0.23.0 (04/01/2022)
-
-**Highlights**
-
-- Support different seeds
-- Provide multi-node training & testing script
-- Update error log
-
-**New Features**
-
-- Support different seeds([#1502](https://github.com/open-mmlab/mmaction2/pull/1502))
-- Provide multi-node training & testing script([#1521](https://github.com/open-mmlab/mmaction2/pull/1521))
-- Update error log([#1546](https://github.com/open-mmlab/mmaction2/pull/1546))
-
-**Documentations**
-
-- Update gpus in Slowfast readme([#1497](https://github.com/open-mmlab/mmaction2/pull/1497))
-- Fix work_dir in multigrid config([#1498](https://github.com/open-mmlab/mmaction2/pull/1498))
-- Add sub bn docs([#1503](https://github.com/open-mmlab/mmaction2/pull/1503))
-- Add shortcycle sampler docs([#1513](https://github.com/open-mmlab/mmaction2/pull/1513))
-- Update Windows Declaration([#1520](https://github.com/open-mmlab/mmaction2/pull/1520))
-- Update the link for ST-GCN([#1544](https://github.com/open-mmlab/mmaction2/pull/1544))
-- Update install commands([#1549](https://github.com/open-mmlab/mmaction2/pull/1549))
-
-**Bug and Typo Fixes**
-
-- Update colab tutorial install cmds([#1522](https://github.com/open-mmlab/mmaction2/pull/1522))
-- Fix num_iters_per_epoch in analyze_logs.py([#1530](https://github.com/open-mmlab/mmaction2/pull/1530))
-- Fix distributed_sampler([#1532](https://github.com/open-mmlab/mmaction2/pull/1532))
-- Fix cd dir error([#1545](https://github.com/open-mmlab/mmaction2/pull/1545))
-- Update arg names([#1548](https://github.com/open-mmlab/mmaction2/pull/1548))
-
-**ModelZoo**
-
-## 0.22.0 (03/05/2022)
-
-**Highlights**
-
-- Support Multigrid training strategy
-- Support CPU training
-- Support audio demo
-- Support topk customizing in models/heads/base.py
-
-**New Features**
-
-- Support Multigrid training strategy([#1378](https://github.com/open-mmlab/mmaction2/pull/1378))
-- Support STGCN in demo_skeleton.py([#1391](https://github.com/open-mmlab/mmaction2/pull/1391))
-- Support CPU training([#1407](https://github.com/open-mmlab/mmaction2/pull/1407))
-- Support audio demo([#1425](https://github.com/open-mmlab/mmaction2/pull/1425))
-- Support topk customizing in models/heads/base.py([#1452](https://github.com/open-mmlab/mmaction2/pull/1452))
-
-**Documentations**
-
-- Add OpenMMLab platform([#1393](https://github.com/open-mmlab/mmaction2/pull/1393))
-- Update links([#1394](https://github.com/open-mmlab/mmaction2/pull/1394))
-- Update readme in configs([#1404](https://github.com/open-mmlab/mmaction2/pull/1404))
-- Update instructions to install mmcv-full([#1426](https://github.com/open-mmlab/mmaction2/pull/1426))
-- Add shortcut([#1433](https://github.com/open-mmlab/mmaction2/pull/1433))
-- Update modelzoo([#1439](https://github.com/open-mmlab/mmaction2/pull/1439))
-- add video_structuralize in readme([#1455](https://github.com/open-mmlab/mmaction2/pull/1455))
-- Update OpenMMLab repo information([#1482](https://github.com/open-mmlab/mmaction2/pull/1482))
-
-**Bug and Typo Fixes**
-
-- Update train.py([#1375](https://github.com/open-mmlab/mmaction2/pull/1375))
-- Fix printout bug([#1382](<(https://github.com/open-mmlab/mmaction2/pull/1382)>))
-- Update multi processing setting([#1395](https://github.com/open-mmlab/mmaction2/pull/1395))
-- Setup multi processing both in train and test([#1405](https://github.com/open-mmlab/mmaction2/pull/1405))
-- Fix bug in nondistributed multi-gpu training([#1406](https://github.com/open-mmlab/mmaction2/pull/1406))
-- Add variable fps in ava_dataset.py([#1409](https://github.com/open-mmlab/mmaction2/pull/1409))
-- Only support distributed training([#1414](https://github.com/open-mmlab/mmaction2/pull/1414))
-- Set test_mode for AVA configs([#1432](https://github.com/open-mmlab/mmaction2/pull/1432))
-- Support single label([#1434](https://github.com/open-mmlab/mmaction2/pull/1434))
-- Add check copyright([#1447](https://github.com/open-mmlab/mmaction2/pull/1447))
-- Support Windows CI([#1448](https://github.com/open-mmlab/mmaction2/pull/1448))
-- Fix wrong device of class_weight in models/losses/cross_entropy_loss.py([#1457](https://github.com/open-mmlab/mmaction2/pull/1457))
-- Fix bug caused by distributed([#1459](https://github.com/open-mmlab/mmaction2/pull/1459))
-- Update readme([#1460](https://github.com/open-mmlab/mmaction2/pull/1460))
-- Fix lint caused by colab automatic upload([#1461](https://github.com/open-mmlab/mmaction2/pull/1461))
-- Refine CI([#1471](https://github.com/open-mmlab/mmaction2/pull/1471))
-- Update pre-commit([#1474](https://github.com/open-mmlab/mmaction2/pull/1474))
-- Add deprecation message for deploy tool([#1483](https://github.com/open-mmlab/mmaction2/pull/1483))
-
-**ModelZoo**
-
-- Support slowfast_steplr([#1421](https://github.com/open-mmlab/mmaction2/pull/1421))
-
-## 0.21.0 (31/12/2021)
-
-**Highlights**
-
-- Support 2s-AGCN
-- Support publish models in Windows
-- Improve some sthv1 related models
-- Support BABEL
-
-**New Features**
-
-- Support 2s-AGCN([#1248](https://github.com/open-mmlab/mmaction2/pull/1248))
-- Support skip postproc in ntu_pose_extraction([#1295](https://github.com/open-mmlab/mmaction2/pull/1295))
-- Support publish models in Windows([#1325](https://github.com/open-mmlab/mmaction2/pull/1325))
-- Add copyright checkhook in pre-commit-config([#1344](https://github.com/open-mmlab/mmaction2/pull/1344))
-
-**Documentations**
-
-- Add MMFlow ([#1273](https://github.com/open-mmlab/mmaction2/pull/1273))
-- Revise README.md and add projects.md ([#1286](https://github.com/open-mmlab/mmaction2/pull/1286))
-- Add 2s-AGCN in Updates([#1289](https://github.com/open-mmlab/mmaction2/pull/1289))
-- Add MMFewShot([#1300](https://github.com/open-mmlab/mmaction2/pull/1300))
-- Add MMHuman3d([#1304](https://github.com/open-mmlab/mmaction2/pull/1304))
-- Update pre-commit([#1313](https://github.com/open-mmlab/mmaction2/pull/1313))
-- Use share menu from the theme instead([#1328](https://github.com/open-mmlab/mmaction2/pull/1328))
-- Update installation command([#1340](https://github.com/open-mmlab/mmaction2/pull/1340))
-
-**Bug and Typo Fixes**
-
-- Update the inference part in notebooks([#1256](https://github.com/open-mmlab/mmaction2/pull/1256))
-- Update the map_location([#1262](<(https://github.com/open-mmlab/mmaction2/pull/1262)>))
-- Fix bug that start_index is not used in RawFrameDecode([#1278](https://github.com/open-mmlab/mmaction2/pull/1278))
-- Fix bug in init_random_seed([#1282](https://github.com/open-mmlab/mmaction2/pull/1282))
-- Fix bug in setup.py([#1303](https://github.com/open-mmlab/mmaction2/pull/1303))
-- Fix interrogate error in workflows([#1305](https://github.com/open-mmlab/mmaction2/pull/1305))
-- Fix typo in slowfast config([#1309](https://github.com/open-mmlab/mmaction2/pull/1309))
-- Cancel previous runs that are not completed([#1327](https://github.com/open-mmlab/mmaction2/pull/1327))
-- Fix missing skip_postproc parameter([#1347](https://github.com/open-mmlab/mmaction2/pull/1347))
-- Update ssn.py([#1355](https://github.com/open-mmlab/mmaction2/pull/1355))
-- Use latest youtube-dl([#1357](https://github.com/open-mmlab/mmaction2/pull/1357))
-- Fix test-best([#1362](https://github.com/open-mmlab/mmaction2/pull/1362))
-
-**ModelZoo**
-
-- Improve some sthv1 related models([#1306](https://github.com/open-mmlab/mmaction2/pull/1306))
-- Support BABEL([#1332](https://github.com/open-mmlab/mmaction2/pull/1332))
-
-## 0.20.0 (07/10/2021)
-
-**Highlights**
-
-- Support TorchServe
-- Add video structuralize demo
-- Support using 3D skeletons for skeleton-based action recognition
-- Benchmark PoseC3D on UCF and HMDB
-
-**New Features**
-
-- Support TorchServe ([#1212](https://github.com/open-mmlab/mmaction2/pull/1212))
-- Support 3D skeletons pre-processing ([#1218](https://github.com/open-mmlab/mmaction2/pull/1218))
-- Support video structuralize demo ([#1197](https://github.com/open-mmlab/mmaction2/pull/1197))
-
-**Documentations**
-
-- Revise README.md and add projects.md ([#1214](https://github.com/open-mmlab/mmaction2/pull/1214))
-- Add CN docs for Skeleton dataset, PoseC3D and ST-GCN ([#1228](https://github.com/open-mmlab/mmaction2/pull/1228), [#1237](https://github.com/open-mmlab/mmaction2/pull/1237), [#1236](https://github.com/open-mmlab/mmaction2/pull/1236))
-- Add tutorial for custom dataset training for skeleton-based action recognition ([#1234](https://github.com/open-mmlab/mmaction2/pull/1234))
-
-**Bug and Typo Fixes**
-
-- Fix tutorial link ([#1219](https://github.com/open-mmlab/mmaction2/pull/1219))
-- Fix GYM links ([#1224](https://github.com/open-mmlab/mmaction2/pull/1224))
-
-**ModelZoo**
-
-- Benchmark PoseC3D on UCF and HMDB ([#1223](https://github.com/open-mmlab/mmaction2/pull/1223))
-- Add ST-GCN + 3D skeleton model for NTU60-XSub ([#1236](https://github.com/open-mmlab/mmaction2/pull/1236))
-
-## 0.19.0 (07/10/2021)
-
-**Highlights**
-
-- Support ST-GCN
-- Refactor the inference API
-- Add code spell check hook
-
-**New Features**
-
-- Support ST-GCN ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123))
-
-**Improvement**
-
-- Add label maps for every dataset ([#1127](https://github.com/open-mmlab/mmaction2/pull/1127))
-- Remove useless code MultiGroupCrop ([#1180](https://github.com/open-mmlab/mmaction2/pull/1180))
-- Refactor Inference API ([#1191](https://github.com/open-mmlab/mmaction2/pull/1191))
-- Add code spell check hook ([#1208](https://github.com/open-mmlab/mmaction2/pull/1208))
-- Use docker in CI ([#1159](https://github.com/open-mmlab/mmaction2/pull/1159))
-
-**Documentations**
-
-- Update metafiles to new OpenMMLAB protocols ([#1134](https://github.com/open-mmlab/mmaction2/pull/1134))
-- Switch to new doc style ([#1160](https://github.com/open-mmlab/mmaction2/pull/1160))
-- Improve the ERROR message ([#1203](https://github.com/open-mmlab/mmaction2/pull/1203))
-- Fix invalid URL in getting_started ([#1169](https://github.com/open-mmlab/mmaction2/pull/1169))
-
-**Bug and Typo Fixes**
-
-- Compatible with new MMClassification ([#1139](https://github.com/open-mmlab/mmaction2/pull/1139))
-- Add missing runtime dependencies ([#1144](https://github.com/open-mmlab/mmaction2/pull/1144))
-- Fix THUMOS tag proposals path ([#1156](https://github.com/open-mmlab/mmaction2/pull/1156))
-- Fix LoadHVULabel ([#1194](https://github.com/open-mmlab/mmaction2/pull/1194))
-- Switch the default value of `persistent_workers` to False ([#1202](https://github.com/open-mmlab/mmaction2/pull/1202))
-- Fix `_freeze_stages` for MobileNetV2 ([#1193](https://github.com/open-mmlab/mmaction2/pull/1193))
-- Fix resume when building rawframes ([#1150](https://github.com/open-mmlab/mmaction2/pull/1150))
-- Fix device bug for class weight ([#1188](https://github.com/open-mmlab/mmaction2/pull/1188))
-- Correct Arg names in extract_audio.py ([#1148](https://github.com/open-mmlab/mmaction2/pull/1148))
-
-**ModelZoo**
-
-- Add TSM-MobileNetV2 ported from TSM ([#1163](https://github.com/open-mmlab/mmaction2/pull/1163))
-- Add ST-GCN for NTURGB+D-XSub-60 ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123))
-
-## 0.18.0 (02/09/2021)
-
-**Improvement**
-
-- Add CopyRight ([#1099](https://github.com/open-mmlab/mmaction2/pull/1099))
-- Support NTU Pose Extraction ([#1076](https://github.com/open-mmlab/mmaction2/pull/1076))
-- Support Caching in RawFrameDecode ([#1078](https://github.com/open-mmlab/mmaction2/pull/1078))
-- Add citations & Support python3.9 CI & Use fixed-version sphinx ([#1125](https://github.com/open-mmlab/mmaction2/pull/1125))
-
-**Documentations**
-
-- Add Descriptions of PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053))
-
-**Bug and Typo Fixes**
-
-- Fix SSV2 checkpoints ([#1101](https://github.com/open-mmlab/mmaction2/pull/1101))
-- Fix CSN normalization ([#1116](https://github.com/open-mmlab/mmaction2/pull/1116))
-- Fix typo ([#1121](https://github.com/open-mmlab/mmaction2/pull/1121))
-- Fix new_crop_quadruple bug ([#1108](https://github.com/open-mmlab/mmaction2/pull/1108))
-
-## 0.17.0 (03/08/2021)
-
-**Highlights**
-
-- Support PyTorch 1.9
-- Support Pytorchvideo Transforms
-- Support PreciseBN
-
-**New Features**
-
-- Support Pytorchvideo Transforms ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008))
-- Support PreciseBN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038))
-
-**Improvements**
-
-- Remove redundant augmentations in config files ([#996](https://github.com/open-mmlab/mmaction2/pull/996))
-- Make resource directory to hold common resource pictures ([#1011](https://github.com/open-mmlab/mmaction2/pull/1011))
-- Remove deprecated FrameSelector ([#1010](https://github.com/open-mmlab/mmaction2/pull/1010))
-- Support Concat Dataset ([#1000](https://github.com/open-mmlab/mmaction2/pull/1000))
-- Add `to-mp4` option to resize_videos.py ([#1021](https://github.com/open-mmlab/mmaction2/pull/1021))
-- Add option to keep tail frames ([#1050](https://github.com/open-mmlab/mmaction2/pull/1050))
-- Update MIM support ([#1061](https://github.com/open-mmlab/mmaction2/pull/1061))
-- Calculate Top-K accurate and inaccurate classes ([#1047](https://github.com/open-mmlab/mmaction2/pull/1047))
-
-**Bug and Typo Fixes**
-
-- Fix bug in PoseC3D demo ([#1009](https://github.com/open-mmlab/mmaction2/pull/1009))
-- Fix some problems in resize_videos.py ([#1012](https://github.com/open-mmlab/mmaction2/pull/1012))
-- Support torch1.9 ([#1015](https://github.com/open-mmlab/mmaction2/pull/1015))
-- Remove redundant code in CI ([#1046](https://github.com/open-mmlab/mmaction2/pull/1046))
-- Fix bug about persistent_workers ([#1044](https://github.com/open-mmlab/mmaction2/pull/1044))
-- Support TimeSformer feature extraction ([#1035](https://github.com/open-mmlab/mmaction2/pull/1035))
-- Fix ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025))
-
-**ModelZoo**
-
-- Add TSM-R50 sthv1 models trained by PytorchVideo RandAugment and AugMix ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008))
-- Update SlowOnly SthV1 checkpoints ([#1034](https://github.com/open-mmlab/mmaction2/pull/1034))
-- Add SlowOnly Kinetics400 checkpoints trained with Precise-BN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038))
-- Add CSN-R50 from scratch checkpoints ([#1045](https://github.com/open-mmlab/mmaction2/pull/1045))
-- TPN Kinetics-400 Checkpoints trained with the new ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025))
-
-**Documentation**
-
-- Add Chinese translation of feature_extraction.md ([#1020](https://github.com/open-mmlab/mmaction2/pull/1020))
-- Fix the code snippet in getting_started.md ([#1023](https://github.com/open-mmlab/mmaction2/pull/1023))
-- Fix TANet config table ([#1028](https://github.com/open-mmlab/mmaction2/pull/1028))
-- Add description to PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053))
-
-## 0.16.0 (01/07/2021)
-
-**Highlights**
-
-- Support using backbone from pytorch-image-models(timm)
-- Support PIMS Decoder
-- Demo for skeleton-based action recognition
-- Support Timesformer
-
-**New Features**
-
-- Support using backbones from pytorch-image-models(timm) for TSN ([#880](https://github.com/open-mmlab/mmaction2/pull/880))
-- Support torchvision transformations in preprocessing pipelines ([#972](https://github.com/open-mmlab/mmaction2/pull/972))
-- Demo for skeleton-based action recognition ([#972](https://github.com/open-mmlab/mmaction2/pull/972))
-- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839))
-
-**Improvements**
-
-- Add a tool to find invalid videos ([#907](https://github.com/open-mmlab/mmaction2/pull/907), [#950](https://github.com/open-mmlab/mmaction2/pull/950))
-- Add an option to specify spectrogram_type ([#909](https://github.com/open-mmlab/mmaction2/pull/909))
-- Add json output to video demo ([#906](https://github.com/open-mmlab/mmaction2/pull/906))
-- Add MIM related docs ([#918](https://github.com/open-mmlab/mmaction2/pull/918))
-- Rename lr to scheduler ([#916](https://github.com/open-mmlab/mmaction2/pull/916))
-- Support `--cfg-options` for demos ([#911](https://github.com/open-mmlab/mmaction2/pull/911))
-- Support number counting for flow-wise filename template ([#922](https://github.com/open-mmlab/mmaction2/pull/922))
-- Add Chinese tutorial ([#941](https://github.com/open-mmlab/mmaction2/pull/941))
-- Change ResNet3D default values ([#939](https://github.com/open-mmlab/mmaction2/pull/939))
-- Adjust script structure ([#935](https://github.com/open-mmlab/mmaction2/pull/935))
-- Add font color to args in long_video_demo ([#947](https://github.com/open-mmlab/mmaction2/pull/947))
-- Polish code style with Pylint ([#908](https://github.com/open-mmlab/mmaction2/pull/908))
-- Support PIMS Decoder ([#946](https://github.com/open-mmlab/mmaction2/pull/946))
-- Improve Metafiles ([#956](https://github.com/open-mmlab/mmaction2/pull/956), [#979](https://github.com/open-mmlab/mmaction2/pull/979), [#966](https://github.com/open-mmlab/mmaction2/pull/966))
-- Add links to download Kinetics400 validation ([#920](https://github.com/open-mmlab/mmaction2/pull/920))
-- Audit the usage of shutil.rmtree ([#943](https://github.com/open-mmlab/mmaction2/pull/943))
-- Polish localizer related codes([#913](https://github.com/open-mmlab/mmaction2/pull/913))
-
-**Bug and Typo Fixes**
-
-- Fix spatiotemporal detection demo ([#899](https://github.com/open-mmlab/mmaction2/pull/899))
-- Fix docstring for 3D inflate ([#925](https://github.com/open-mmlab/mmaction2/pull/925))
-- Fix bug of writing text to video with TextClip ([#952](https://github.com/open-mmlab/mmaction2/pull/952))
-- Fix mmcv install in CI ([#977](https://github.com/open-mmlab/mmaction2/pull/977))
-
-**ModelZoo**
-
-- Add TSN with Swin Transformer backbone as an example for using pytorch-image-models(timm) backbones ([#880](https://github.com/open-mmlab/mmaction2/pull/880))
-- Port CSN checkpoints from VMZ ([#945](https://github.com/open-mmlab/mmaction2/pull/945))
-- Release various checkpoints for UCF101, HMDB51 and Sthv1 ([#938](https://github.com/open-mmlab/mmaction2/pull/938))
-- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839))
-- Update TSM modelzoo ([#981](https://github.com/open-mmlab/mmaction2/pull/981))
-
-## 0.15.0 (31/05/2021)
-
-**Highlights**
-
-- Support PoseC3D
-- Support ACRN
-- Support MIM
-
-**New Features**
-
-- Support PoseC3D ([#786](https://github.com/open-mmlab/mmaction2/pull/786), [#890](https://github.com/open-mmlab/mmaction2/pull/890))
-- Support MIM ([#870](https://github.com/open-mmlab/mmaction2/pull/870))
-- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891))
-- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864))
-
-**Improvements**
-
-- Add `metric_options` for evaluation to docs ([#873](https://github.com/open-mmlab/mmaction2/pull/873))
-- Support creating a new label map based on custom classes for the spatio-temporal detection demo ([#879](https://github.com/open-mmlab/mmaction2/pull/879))
-- Improve document about AVA dataset preparation ([#878](https://github.com/open-mmlab/mmaction2/pull/878))
-- Provide a script to extract clip-level feature ([#856](https://github.com/open-mmlab/mmaction2/pull/856))
-
-**Bug and Typo Fixes**
-
-- Fix issues about resume ([#877](https://github.com/open-mmlab/mmaction2/pull/877), [#878](https://github.com/open-mmlab/mmaction2/pull/878))
-- Correct the key name of `eval_results` dictionary for metric 'mmit_mean_average_precision' ([#885](https://github.com/open-mmlab/mmaction2/pull/885))
-
-**ModelZoo**
-
-- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864))
-- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891))
-
-## 0.14.0 (30/04/2021)
-
-**Highlights**
-
-- Support TRN
-- Support Diving48
-
-**New Features**
-
-- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755))
-- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835))
-- Support Webcam Demo for Spatio-temporal Action Detection Models ([#795](https://github.com/open-mmlab/mmaction2/pull/795))
-
-**Improvements**
-
-- Add softmax option for pytorch2onnx tool ([#781](https://github.com/open-mmlab/mmaction2/pull/781))
-- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755))
-- Test with onnx models and TensorRT engines ([#758](https://github.com/open-mmlab/mmaction2/pull/758))
-- Speed up AVA Testing ([#784](https://github.com/open-mmlab/mmaction2/pull/784))
-- Add `self.with_neck` attribute ([#796](https://github.com/open-mmlab/mmaction2/pull/796))
-- Update installation document ([#798](https://github.com/open-mmlab/mmaction2/pull/798))
-- Use a random master port ([#809](https://github.com/open-mmlab/mmaction2/pull/809))
-- Update AVA processing data document ([#801](https://github.com/open-mmlab/mmaction2/pull/801))
-- Refactor spatio-temporal augmentation ([#782](https://github.com/open-mmlab/mmaction2/pull/782))
-- Add QR code in CN README ([#812](https://github.com/open-mmlab/mmaction2/pull/812))
-- Add Alternative way to download Kinetics ([#817](https://github.com/open-mmlab/mmaction2/pull/817), [#822](https://github.com/open-mmlab/mmaction2/pull/822))
-- Refactor Sampler ([#790](https://github.com/open-mmlab/mmaction2/pull/790))
-- Use EvalHook in MMCV with backward compatibility ([#793](https://github.com/open-mmlab/mmaction2/pull/793))
-- Use MMCV Model Registry ([#843](https://github.com/open-mmlab/mmaction2/pull/843))
-
-**Bug and Typo Fixes**
-
-- Fix a bug in pytorch2onnx.py when `num_classes <= 4` ([#800](https://github.com/open-mmlab/mmaction2/pull/800), [#824](https://github.com/open-mmlab/mmaction2/pull/824))
-- Fix `demo_spatiotemporal_det.py` error ([#803](https://github.com/open-mmlab/mmaction2/pull/803), [#805](https://github.com/open-mmlab/mmaction2/pull/805))
-- Fix loading config bugs when resume ([#820](https://github.com/open-mmlab/mmaction2/pull/820))
-- Make HMDB51 annotation generation more robust ([#811](https://github.com/open-mmlab/mmaction2/pull/811))
-
-**ModelZoo**
-
-- Update checkpoint for 256 height in something-V2 ([#789](https://github.com/open-mmlab/mmaction2/pull/789))
-- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835))
-
-## 0.13.0 (31/03/2021)
-
-**Highlights**
-
-- Support LFB
-- Support using backbone from MMCls/TorchVision
-- Add Chinese documentation
-
-**New Features**
-
-- Support LFB ([#553](https://github.com/open-mmlab/mmaction2/pull/553))
-- Support using backbones from MMCls for TSN ([#679](https://github.com/open-mmlab/mmaction2/pull/679))
-- Support using backbones from TorchVision for TSN ([#720](https://github.com/open-mmlab/mmaction2/pull/720))
-- Support Mixup and Cutmix for recognizers ([#681](https://github.com/open-mmlab/mmaction2/pull/681))
-- Support Chinese documentation ([#665](https://github.com/open-mmlab/mmaction2/pull/665), [#680](https://github.com/open-mmlab/mmaction2/pull/680), [#689](https://github.com/open-mmlab/mmaction2/pull/689), [#701](https://github.com/open-mmlab/mmaction2/pull/701), [#702](https://github.com/open-mmlab/mmaction2/pull/702), [#703](https://github.com/open-mmlab/mmaction2/pull/703), [#706](https://github.com/open-mmlab/mmaction2/pull/706), [#716](https://github.com/open-mmlab/mmaction2/pull/716), [#717](https://github.com/open-mmlab/mmaction2/pull/717), [#731](https://github.com/open-mmlab/mmaction2/pull/731), [#733](https://github.com/open-mmlab/mmaction2/pull/733), [#735](https://github.com/open-mmlab/mmaction2/pull/735), [#736](https://github.com/open-mmlab/mmaction2/pull/736), [#737](https://github.com/open-mmlab/mmaction2/pull/737), [#738](https://github.com/open-mmlab/mmaction2/pull/738), [#739](https://github.com/open-mmlab/mmaction2/pull/739), [#740](https://github.com/open-mmlab/mmaction2/pull/740), [#742](https://github.com/open-mmlab/mmaction2/pull/742), [#752](https://github.com/open-mmlab/mmaction2/pull/752), [#759](https://github.com/open-mmlab/mmaction2/pull/759), [#761](https://github.com/open-mmlab/mmaction2/pull/761), [#772](https://github.com/open-mmlab/mmaction2/pull/772), [#775](https://github.com/open-mmlab/mmaction2/pull/775))
-
-**Improvements**
-
-- Add slowfast config/json/log/ckpt for training custom classes of AVA ([#678](https://github.com/open-mmlab/mmaction2/pull/678))
-- Set RandAugment as Imgaug default transforms ([#585](https://github.com/open-mmlab/mmaction2/pull/585))
-- Add `--test-last` & `--test-best` for `tools/train.py` to test checkpoints after training ([#608](https://github.com/open-mmlab/mmaction2/pull/608))
-- Add fcn_testing in TPN ([#684](https://github.com/open-mmlab/mmaction2/pull/684))
-- Remove redundant recall functions ([#741](https://github.com/open-mmlab/mmaction2/pull/741))
-- Recursively remove pretrained step for testing ([#695](https://github.com/open-mmlab/mmaction2/pull/695))
-- Improve demo by limiting inference fps ([#668](https://github.com/open-mmlab/mmaction2/pull/668))
-
-**Bug and Typo Fixes**
-
-- Fix a bug about multi-class in VideoDataset ([#723](https://github.com/open-mmlab/mmaction2/pull/723))
-- Reverse key-value in anet filelist generation ([#686](https://github.com/open-mmlab/mmaction2/pull/686))
-- Fix flow norm cfg typo ([#693](https://github.com/open-mmlab/mmaction2/pull/693))
-
-**ModelZoo**
-
-- Add LFB for AVA2.1 ([#553](https://github.com/open-mmlab/mmaction2/pull/553))
-- Add TSN with ResNeXt-101-32x4d backbone as an example for using MMCls backbones ([#679](https://github.com/open-mmlab/mmaction2/pull/679))
-- Add TSN with Densenet161 backbone as an example for using TorchVision backbones ([#720](https://github.com/open-mmlab/mmaction2/pull/720))
-- Add slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb ([#690](https://github.com/open-mmlab/mmaction2/pull/690))
-- Add slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb ([#704](https://github.com/open-mmlab/mmaction2/pull/704))
-- Add slowonly_nl_kinetics_pretrained_r50_4x16x1(8x8x1)\_20e_ava_rgb ([#730](https://github.com/open-mmlab/mmaction2/pull/730))
-
-## 0.12.0 (28/02/2021)
-
-**Highlights**
-
-- Support TSM-MobileNetV2
-- Support TANet
-- Support GPU Normalize
-
-**New Features**
-
-- Support TSM-MobileNetV2 ([#415](https://github.com/open-mmlab/mmaction2/pull/415))
-- Support flip with label mapping ([#591](https://github.com/open-mmlab/mmaction2/pull/591))
-- Add seed option for sampler ([#642](https://github.com/open-mmlab/mmaction2/pull/642))
-- Support GPU Normalize ([#586](https://github.com/open-mmlab/mmaction2/pull/586))
-- Support TANet ([#595](https://github.com/open-mmlab/mmaction2/pull/595))
-
-**Improvements**
-
-- Training custom classes of ava dataset ([#555](https://github.com/open-mmlab/mmaction2/pull/555))
-- Add CN README in homepage ([#592](https://github.com/open-mmlab/mmaction2/pull/592), [#594](https://github.com/open-mmlab/mmaction2/pull/594))
-- Support soft label for CrossEntropyLoss ([#625](https://github.com/open-mmlab/mmaction2/pull/625))
-- Refactor config: Specify `train_cfg` and `test_cfg` in `model` ([#629](https://github.com/open-mmlab/mmaction2/pull/629))
-- Provide an alternative way to download older kinetics annotations ([#597](https://github.com/open-mmlab/mmaction2/pull/597))
-- Update FAQ for
- - 1). data pipeline about video and frames ([#598](https://github.com/open-mmlab/mmaction2/pull/598))
- - 2). how to show results ([#598](https://github.com/open-mmlab/mmaction2/pull/598))
- - 3). batch size setting for batchnorm ([#657](https://github.com/open-mmlab/mmaction2/pull/657))
- - 4). how to fix stages of backbone when finetuning models ([#658](https://github.com/open-mmlab/mmaction2/pull/658))
-- Modify default value of `save_best` ([#600](https://github.com/open-mmlab/mmaction2/pull/600))
-- Use BibTex rather than latex in markdown ([#607](https://github.com/open-mmlab/mmaction2/pull/607))
-- Add warnings of uninstalling mmdet and supplementary documents ([#624](https://github.com/open-mmlab/mmaction2/pull/624))
-
-**Bug and Typo Fixes**
-
-- Fix value of `pem_low_temporal_iou_threshold` in BSN ([#556](https://github.com/open-mmlab/mmaction2/pull/556))
-- Fix ActivityNet download script ([#601](https://github.com/open-mmlab/mmaction2/pull/601))
-
-**ModelZoo**
-
-- Add TSM-MobileNetV2 for Kinetics400 ([#415](https://github.com/open-mmlab/mmaction2/pull/415))
-- Add deeper SlowFast models ([#605](https://github.com/open-mmlab/mmaction2/pull/605))
-
-## 0.11.0 (31/01/2021)
-
-**Highlights**
-
-- Support imgaug
-- Support spatial temporal demo
-- Refactor EvalHook, config structure, unittest structure
-
-**New Features**
-
-- Support [imgaug](https://imgaug.readthedocs.io/en/latest/index.html) for augmentations in the data pipeline ([#492](https://github.com/open-mmlab/mmaction2/pull/492))
-- Support setting `max_testing_views` for extremely large models to save GPU memory used ([#511](https://github.com/open-mmlab/mmaction2/pull/511))
-- Add spatial temporal demo ([#547](https://github.com/open-mmlab/mmaction2/pull/547), [#566](https://github.com/open-mmlab/mmaction2/pull/566))
-
-**Improvements**
-
-- Refactor EvalHook ([#395](https://github.com/open-mmlab/mmaction2/pull/395))
-- Refactor AVA hook ([#567](https://github.com/open-mmlab/mmaction2/pull/567))
-- Add repo citation ([#545](https://github.com/open-mmlab/mmaction2/pull/545))
-- Add dataset size of Kinetics400 ([#503](https://github.com/open-mmlab/mmaction2/pull/503))
-- Add lazy operation docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504))
-- Add class_weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509))
-- Add some explanation about the resampling in SlowFast ([#502](https://github.com/open-mmlab/mmaction2/pull/502))
-- Modify paper title in README.md ([#512](https://github.com/open-mmlab/mmaction2/pull/512))
-- Add alternative ways to download Kinetics ([#521](https://github.com/open-mmlab/mmaction2/pull/521))
-- Add OpenMMLab projects link in README ([#530](https://github.com/open-mmlab/mmaction2/pull/530))
-- Change default preprocessing to resize the short edge to 256 ([#538](https://github.com/open-mmlab/mmaction2/pull/538))
-- Add config tag in dataset README ([#540](https://github.com/open-mmlab/mmaction2/pull/540))
-- Add solution for markdownlint installation issue ([#497](https://github.com/open-mmlab/mmaction2/pull/497))
-- Add dataset overview in readthedocs ([#548](https://github.com/open-mmlab/mmaction2/pull/548))
-- Modify the trigger mode of the warnings of missing mmdet ([#583](https://github.com/open-mmlab/mmaction2/pull/583))
-- Refactor config structure ([#488](https://github.com/open-mmlab/mmaction2/pull/488), [#572](https://github.com/open-mmlab/mmaction2/pull/572))
-- Refactor unittest structure ([#433](https://github.com/open-mmlab/mmaction2/pull/433))
-
-**Bug and Typo Fixes**
-
-- Fix a bug about ava dataset validation ([#527](https://github.com/open-mmlab/mmaction2/pull/527))
-- Fix a bug about ResNet pretrain weight initialization ([#582](https://github.com/open-mmlab/mmaction2/pull/582))
-- Fix a bug in CI due to MMCV index ([#495](https://github.com/open-mmlab/mmaction2/pull/495))
-- Remove invalid links of MiT and MMiT ([#516](https://github.com/open-mmlab/mmaction2/pull/516))
-- Fix frame rate bug for AVA preparation ([#576](https://github.com/open-mmlab/mmaction2/pull/576))
-
-**ModelZoo**
-
-## 0.10.0 (31/12/2020)
-
-**Highlights**
-
-- Support Spatio-Temporal Action Detection (AVA)
-- Support precise BN
-
-**New Features**
-
-- Support precise BN ([#501](https://github.com/open-mmlab/mmaction2/pull/501/))
-- Support Spatio-Temporal Action Detection (AVA) ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
-- Support to return feature maps in `inference_recognizer` ([#458](https://github.com/open-mmlab/mmaction2/pull/458))
-
-**Improvements**
-
-- Add arg `stride` to long_video_demo.py, to make inference faster ([#468](https://github.com/open-mmlab/mmaction2/pull/468))
-- Support training and testing for Spatio-Temporal Action Detection ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
-- Fix CI due to pip upgrade ([#454](https://github.com/open-mmlab/mmaction2/pull/454))
-- Add markdown lint in pre-commit hook ([#255](https://github.com/open-mmlab/mmaction2/pull/255))
-- Speed up confusion matrix calculation ([#465](https://github.com/open-mmlab/mmaction2/pull/465))
-- Use title case in modelzoo statistics ([#456](https://github.com/open-mmlab/mmaction2/pull/456))
-- Add FAQ documents for easy troubleshooting. ([#413](https://github.com/open-mmlab/mmaction2/pull/413), [#420](https://github.com/open-mmlab/mmaction2/pull/420), [#439](https://github.com/open-mmlab/mmaction2/pull/439))
-- Support Spatio-Temporal Action Detection with context ([#471](https://github.com/open-mmlab/mmaction2/pull/471))
-- Add class weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509))
-- Add Lazy OPs docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504))
-
-**Bug and Typo Fixes**
-
-- Fix typo in default argument of BaseHead ([#446](https://github.com/open-mmlab/mmaction2/pull/446))
-- Fix potential bug about `output_config` overwrite ([#463](https://github.com/open-mmlab/mmaction2/pull/463))
-
-**ModelZoo**
-
-- Add SlowOnly, SlowFast for AVA2.1 ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
-
-## 0.9.0 (30/11/2020)
-
-**Highlights**
-
-- Support GradCAM utils for recognizers
-- Support ResNet Audio model
-
-**New Features**
-
-- Automatically add modelzoo statistics to readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327))
-- Support GYM99 ([#331](https://github.com/open-mmlab/mmaction2/pull/331), [#336](https://github.com/open-mmlab/mmaction2/pull/336))
-- Add AudioOnly Pathway from AVSlowFast. ([#355](https://github.com/open-mmlab/mmaction2/pull/355))
-- Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324))
-- Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345))
-- Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291))
-
-**Improvements**
-
-- Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312))
-- Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274))
-- Update docs about test crops ([#359](https://github.com/open-mmlab/mmaction2/pull/359))
-- Polish code format using pylint manually ([#338](https://github.com/open-mmlab/mmaction2/pull/338))
-- Update unittest coverage ([#358](https://github.com/open-mmlab/mmaction2/pull/358), [#322](https://github.com/open-mmlab/mmaction2/pull/322), [#325](https://github.com/open-mmlab/mmaction2/pull/325))
-- Add random seed for building filelists ([#323](https://github.com/open-mmlab/mmaction2/pull/323))
-- Update colab tutorial ([#367](https://github.com/open-mmlab/mmaction2/pull/367))
-- Set default batch_size of evaluation and testing to 1 ([#250](https://github.com/open-mmlab/mmaction2/pull/250))
-- Rename the preparation docs to `README.md` ([#388](https://github.com/open-mmlab/mmaction2/pull/388))
-- Move docs about demo to `demo/README.md` ([#329](https://github.com/open-mmlab/mmaction2/pull/329))
-- Remove redundant code in `tools/test.py` ([#310](https://github.com/open-mmlab/mmaction2/pull/310))
-- Automatically calculate number of test clips for Recognizer2D ([#359](https://github.com/open-mmlab/mmaction2/pull/359))
-
-**Bug and Typo Fixes**
-
-- Fix rename Kinetics classnames bug ([#384](https://github.com/open-mmlab/mmaction2/pull/384))
-- Fix a bug in BaseDataset when `data_prefix` is None ([#314](https://github.com/open-mmlab/mmaction2/pull/314))
-- Fix a bug about `tmp_folder` in `OpenCVInit` ([#357](https://github.com/open-mmlab/mmaction2/pull/357))
-- Fix `get_thread_id` when not using disk as backend ([#354](https://github.com/open-mmlab/mmaction2/pull/354), [#357](https://github.com/open-mmlab/mmaction2/pull/357))
-- Fix the bug of HVU object `num_classes` from 1679 to 1678 ([#307](https://github.com/open-mmlab/mmaction2/pull/307))
-- Fix typo in `export_model.md` ([#399](https://github.com/open-mmlab/mmaction2/pull/399))
-- Fix OmniSource training configs ([#321](https://github.com/open-mmlab/mmaction2/pull/321))
-- Fix Issue #306: Bug of SampleAVAFrames ([#317](https://github.com/open-mmlab/mmaction2/pull/317))
-
-**ModelZoo**
-
-- Add SlowOnly model for GYM99, both RGB and Flow ([#336](https://github.com/open-mmlab/mmaction2/pull/336))
-- Add auto modelzoo statistics in readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327))
-- Add TSN for HMDB51 pretrained on Kinetics400, Moments in Time and ImageNet. ([#372](https://github.com/open-mmlab/mmaction2/pull/372))
-
-## v0.8.0 (31/10/2020)
-
-**Highlights**
-
-- Support [OmniSource](https://arxiv.org/abs/2003.13042)
-- Support C3D
-- Support video recognition with audio modality
-- Support HVU
-- Support X3D
-
-**New Features**
-
-- Support AVA dataset preparation ([#266](https://github.com/open-mmlab/mmaction2/pull/266))
-- Support the training of video recognition dataset with multiple tag categories ([#235](https://github.com/open-mmlab/mmaction2/pull/235))
-- Support joint training with multiple training datasets of multiple formats, including images, untrimmed videos, etc. ([#242](https://github.com/open-mmlab/mmaction2/pull/242))
-- Support to specify a start epoch to conduct evaluation ([#216](https://github.com/open-mmlab/mmaction2/pull/216))
-- Implement X3D models, support testing with model weights converted from SlowFast ([#288](https://github.com/open-mmlab/mmaction2/pull/288))
-
-**Improvements**
-
-- Set default values of 'average_clips' in each config file so that there is no need to set it explicitly during testing in most cases ([#232](https://github.com/open-mmlab/mmaction2/pull/232))
-- Extend HVU datatools to generate individual file list for each tag category ([#258](https://github.com/open-mmlab/mmaction2/pull/258))
-- Support data preparation for Kinetics-600 and Kinetics-700 ([#254](https://github.com/open-mmlab/mmaction2/pull/254))
-- Use `metric_dict` to replace hardcoded arguments in `evaluate` function ([#286](https://github.com/open-mmlab/mmaction2/pull/286))
-- Add `cfg-options` in arguments to override some settings in the used config for convenience ([#212](https://github.com/open-mmlab/mmaction2/pull/212))
-- Rename the old evaluating protocol `mean_average_precision` as `mmit_mean_average_precision` since it is only used on MMIT and is not the `mAP` we usually talk about. Add `mean_average_precision`, which is the real `mAP` ([#235](https://github.com/open-mmlab/mmaction2/pull/235))
-- Add accurate setting (Three crop * 2 clip) and report corresponding performance for TSM model ([#241](https://github.com/open-mmlab/mmaction2/pull/241))
-- Add citations in each preparing_dataset.md in `tools/data/dataset` ([#289](https://github.com/open-mmlab/mmaction2/pull/289))
-- Update the performance of audio-visual fusion on Kinetics-400 ([#281](https://github.com/open-mmlab/mmaction2/pull/281))
-- Support data preparation of OmniSource web datasets, including GoogleImage, InsImage, InsVideo and KineticsRawVideo ([#294](https://github.com/open-mmlab/mmaction2/pull/294))
-- Use `metric_options` dict to provide metric args in `evaluate` ([#286](https://github.com/open-mmlab/mmaction2/pull/286))
-
-**Bug Fixes**
-
-- Register `FrameSelector` in `PIPELINES` ([#268](https://github.com/open-mmlab/mmaction2/pull/268))
-- Fix the potential bug for default value in dataset_setting ([#245](https://github.com/open-mmlab/mmaction2/pull/245))
-- Fix multi-node dist test ([#292](https://github.com/open-mmlab/mmaction2/pull/292))
-- Fix the data preparation bug for `something-something` dataset ([#278](https://github.com/open-mmlab/mmaction2/pull/278))
-- Fix the invalid config url in slowonly README data benchmark ([#249](https://github.com/open-mmlab/mmaction2/pull/249))
-- Validate that the performance of models trained with videos has no significant difference compared to models trained with rawframes ([#256](https://github.com/open-mmlab/mmaction2/pull/256))
-- Correct the `img_norm_cfg` used by TSN-3seg-R50 UCF-101 model, improve the Top-1 accuracy by 3% ([#273](https://github.com/open-mmlab/mmaction2/pull/273))
-
-**ModelZoo**
-
-- Add Baselines for Kinetics-600 and Kinetics-700, including TSN-R50-8seg and SlowOnly-R50-8x8 ([#259](https://github.com/open-mmlab/mmaction2/pull/259))
-- Add OmniSource benchmark on MiniKinetics ([#296](https://github.com/open-mmlab/mmaction2/pull/296))
-- Add Baselines for HVU, including TSN-R18-8seg on 6 tag categories of HVU ([#287](https://github.com/open-mmlab/mmaction2/pull/287))
-- Add X3D models ported from [SlowFast](https://github.com/facebookresearch/SlowFast/) ([#288](https://github.com/open-mmlab/mmaction2/pull/288))
-
-## v0.7.0 (30/9/2020)
-
-**Highlights**
-
-- Support TPN
-- Support JHMDB, UCF101-24, HVU dataset preparation
-- Support ONNX model conversion
-
-**New Features**
-
-- Support the data pre-processing pipeline for the HVU Dataset ([#227](https://github.com/open-mmlab/mmaction2/pull/227/))
-- Support real-time action recognition from web camera ([#171](https://github.com/open-mmlab/mmaction2/pull/171))
-- Support onnx ([#160](https://github.com/open-mmlab/mmaction2/pull/160))
-- Support UCF101-24 preparation ([#219](https://github.com/open-mmlab/mmaction2/pull/219))
-- Support evaluating mAP for ActivityNet with [CUHK17_activitynet_pred](http://activity-net.org/challenges/2017/evaluation.html) ([#176](https://github.com/open-mmlab/mmaction2/pull/176))
-- Add the data pipeline for ActivityNet, including downloading videos, extracting RGB and Flow frames, finetuning TSN and extracting feature ([#190](https://github.com/open-mmlab/mmaction2/pull/190))
-- Support JHMDB preparation ([#220](https://github.com/open-mmlab/mmaction2/pull/220))
-
-**ModelZoo**
-
-- Add finetuning setting for SlowOnly ([#173](https://github.com/open-mmlab/mmaction2/pull/173))
-- Add TSN and SlowOnly models trained with [OmniSource](https://arxiv.org/abs/2003.13042), which achieve 75.7% Top-1 with TSN-R50-3seg and 80.4% Top-1 with SlowOnly-R101-8x8 ([#215](https://github.com/open-mmlab/mmaction2/pull/215))
-
-**Improvements**
-
-- Support demo with video url ([#165](https://github.com/open-mmlab/mmaction2/pull/165))
-- Support multi-batch when testing ([#184](https://github.com/open-mmlab/mmaction2/pull/184))
-- Add tutorial for adding a new learning rate updater ([#181](https://github.com/open-mmlab/mmaction2/pull/181))
-- Add config name in meta info ([#183](https://github.com/open-mmlab/mmaction2/pull/183))
-- Remove git hash in `__version__` ([#189](https://github.com/open-mmlab/mmaction2/pull/189))
-- Check mmcv version ([#189](https://github.com/open-mmlab/mmaction2/pull/189))
-- Update url with 'https://download.openmmlab.com' ([#208](https://github.com/open-mmlab/mmaction2/pull/208))
-- Update Docker file to support PyTorch 1.6 and update `install.md` ([#209](https://github.com/open-mmlab/mmaction2/pull/209))
-- Polish readthedocs display ([#217](https://github.com/open-mmlab/mmaction2/pull/217), [#229](https://github.com/open-mmlab/mmaction2/pull/229))
-
-**Bug Fixes**
-
-- Fix the bug when using OpenCV to extract only RGB frames with original shape ([#187](https://github.com/open-mmlab/mmaction2/pull/187))
-- Fix the bug of sthv2 `num_classes` from 339 to 174 ([#174](https://github.com/open-mmlab/mmaction2/pull/174), [#207](https://github.com/open-mmlab/mmaction2/pull/207))
-
-## v0.6.0 (2/9/2020)
-
-**Highlights**
-
-- Support TIN, CSN, SSN, NonLocal
-- Support FP16 training
-
-**New Features**
-
-- Support NonLocal module and provide ckpt in TSM and I3D ([#41](https://github.com/open-mmlab/mmaction2/pull/41))
-- Support SSN ([#33](https://github.com/open-mmlab/mmaction2/pull/33), [#37](https://github.com/open-mmlab/mmaction2/pull/37), [#52](https://github.com/open-mmlab/mmaction2/pull/52), [#55](https://github.com/open-mmlab/mmaction2/pull/55))
-- Support CSN ([#87](https://github.com/open-mmlab/mmaction2/pull/87))
-- Support TIN ([#53](https://github.com/open-mmlab/mmaction2/pull/53))
-- Support HMDB51 dataset preparation ([#60](https://github.com/open-mmlab/mmaction2/pull/60))
-- Support encoding videos from frames ([#84](https://github.com/open-mmlab/mmaction2/pull/84))
-- Support FP16 training ([#25](https://github.com/open-mmlab/mmaction2/pull/25))
-- Enhance demo by supporting rawframe inference ([#59](https://github.com/open-mmlab/mmaction2/pull/59)), output video/gif ([#72](https://github.com/open-mmlab/mmaction2/pull/72))
-
-**ModelZoo**
-
-- Update Slowfast modelzoo ([#51](https://github.com/open-mmlab/mmaction2/pull/51))
-- Update TSN, TSM video checkpoints ([#50](https://github.com/open-mmlab/mmaction2/pull/50))
-- Add data benchmark for TSN ([#57](https://github.com/open-mmlab/mmaction2/pull/57))
-- Add data benchmark for SlowOnly ([#77](https://github.com/open-mmlab/mmaction2/pull/77))
-- Add BSN/BMN performance results with feature extracted by our codebase ([#99](https://github.com/open-mmlab/mmaction2/pull/99))
-
-**Improvements**
-
-- Polish data preparation codes ([#70](https://github.com/open-mmlab/mmaction2/pull/70))
-- Improve data preparation scripts ([#58](https://github.com/open-mmlab/mmaction2/pull/58))
-- Improve unittest coverage and minor fix ([#62](https://github.com/open-mmlab/mmaction2/pull/62))
-- Support PyTorch 1.6 in CI ([#117](https://github.com/open-mmlab/mmaction2/pull/117))
-- Support `with_offset` for rawframe dataset ([#48](https://github.com/open-mmlab/mmaction2/pull/48))
-- Support json annotation files ([#119](https://github.com/open-mmlab/mmaction2/pull/119))
-- Support `multi-class` in TSMHead ([#104](https://github.com/open-mmlab/mmaction2/pull/104))
-- Support using `val_step()` to validate data for each `val` workflow ([#123](https://github.com/open-mmlab/mmaction2/pull/123))
-- Use `xxInit()` method to get `total_frames` and make `total_frames` a required key ([#90](https://github.com/open-mmlab/mmaction2/pull/90))
-- Add paper introduction in model readme ([#140](https://github.com/open-mmlab/mmaction2/pull/140))
-- Adjust the directory structure of `tools/` and rename some scripts files ([#142](https://github.com/open-mmlab/mmaction2/pull/142))
-
-**Bug Fixes**
-
-- Fix configs for localization test ([#67](https://github.com/open-mmlab/mmaction2/pull/67))
-- Fix configs of SlowOnly by fixing lr to 8 gpus ([#136](https://github.com/open-mmlab/mmaction2/pull/136))
-- Fix the bug in analyze_log ([#54](https://github.com/open-mmlab/mmaction2/pull/54))
-- Fix the bug of generating HMDB51 class index file ([#69](https://github.com/open-mmlab/mmaction2/pull/69))
-- Fix the bug of using `load_checkpoint()` in ResNet ([#93](https://github.com/open-mmlab/mmaction2/pull/93))
-- Fix the bug of `--work-dir` when using slurm training script ([#110](https://github.com/open-mmlab/mmaction2/pull/110))
-- Correct the sthv1/sthv2 rawframes filelist generate command ([#71](https://github.com/open-mmlab/mmaction2/pull/71))
-- `CosineAnnealing` typo ([#47](https://github.com/open-mmlab/mmaction2/pull/47))
-
-## v0.5.0 (9/7/2020)
-
-**Highlights**
-
-- MMAction2 is released
-
-**New Features**
-
-- Support various datasets: UCF101, Kinetics-400, Something-Something V1&V2, Moments in Time,
- Multi-Moments in Time, THUMOS14
-- Support various action recognition methods: TSN, TSM, R(2+1)D, I3D, SlowOnly, SlowFast, Non-local
-- Support various action localization methods: BSN, BMN
-- Colab demo for action recognition
diff --git a/docs/zh_cn/notes/contribution_guide.md b/docs/zh_cn/notes/contribution_guide.md
deleted file mode 100644
index 07dbbd105c..0000000000
--- a/docs/zh_cn/notes/contribution_guide.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# 参与贡献 MMAction2
-
-欢迎任何类型的贡献,包括但不限于
-
-- 修改拼写错误或代码错误
-- 添加文档或将文档翻译成其他语言
-- 添加新功能和新组件
-
-## 工作流程
-
-1. fork 并 pull 最新的 OpenMMLab 仓库 (MMAction2)
-2. 签出到一个新分支(不要使用 master 分支提交 PR)
-3. 进行修改并提交至 fork 出的自己的远程仓库
-4. 在我们的仓库中创建一个 PR
-
-```{note}
-如果你计划添加一些新的功能,并引入大量改动,请尽量首先创建一个 issue 来进行讨论。
-如果你是论文作者,希望在 MMAction2 中支持你的算法,请联系我们。 我们十分感谢你的贡献。
-```
-
-## 代码风格
-
-### Python
-
-我们采用 [PEP8](https://www.python.org/dev/peps/pep-0008/) 作为统一的代码风格。
-
-我们使用下列工具来进行代码风格检查与格式化:
-
-- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具,是多个检查工具的封装
-- [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具
-- [yapf](https://github.com/google/yapf): 一个 Python 文件的格式化工具。
-- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误
-- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具
-- [docformatter](https://github.com/myint/docformatter): 一个 docstring 格式化工具。
-
-yapf 和 isort 的格式设置位于 [setup.cfg](../../../setup.cfg)
-
-我们使用 [pre-commit hook](https://pre-commit.com/) 来保证每次提交时自动进行代
-码检查和格式化,启用的功能包括 `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, 修复 `end-of-files`, `double-quoted-strings`,
-`python-encoding-pragma`, `mixed-line-ending`, 对 `requirements.txt` 的排序等。
-pre-commit hook 的配置文件位于 [.pre-commit-config](../../../.pre-commit-config.yaml)
-
-在你克隆仓库后,你需要按照如下步骤安装并初始化 pre-commit hook。
-
-```shell
-pip install -U pre-commit
-```
-
-在仓库文件夹中执行
-
-```shell
-pre-commit install
-```
-
-在此之后,每次提交,代码规范检查和格式化工具都将被强制执行。
-
-```{important}
-在创建 PR 之前,请确保你的代码完成了代码规范检查,并经过了 yapf 的格式化。
-```
-
-### C++ 和 CUDA
-
-C++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)
diff --git a/docs/zh_cn/notes/ecosystem.md b/docs/zh_cn/notes/ecosystem.md
new file mode 100644
index 0000000000..2aafe731c8
--- /dev/null
+++ b/docs/zh_cn/notes/ecosystem.md
@@ -0,0 +1,23 @@
+# 基于 MMAction2 的生态项目
+
+有许多研究工作和项目是基于 MMAction2 构建的。
+我们列举了一些例子,展示了如何扩展 MMAction2 来适用于您自己的项目。
+由于本页面可能尚未收录所有相关项目,欢迎随时提交 PR 来更新此页面。
+
+## 作为扩展的项目
+
+- [OTEAction2](https://github.com/openvinotoolkit/mmaction2):用于动作识别的 OpenVINO 训练扩展。
+- [PYSKL](https://github.com/kennymckormick/pyskl):一个专注于基于骨骼点动作识别的工具箱。
+
+## 论文相关的项目
+
+还有一些与论文一起发布的项目。
+其中一些论文发表在顶级会议(CVPR、ICCV 和 ECCV)上,其他一些也具有很高的影响力。
+我们按照会议时间列出它们,方便社区参考。
+
+- Video Swin Transformer,CVPR 2022 [\[论文\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer)
+- Evidential Deep Learning for Open Set Action Recognition,ICCV 2021 Oral [\[论文\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR)
+- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective,ICCV 2021 Oral [\[论文\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS)
+- MGSampler: An Explainable Sampling Strategy for Video Action Recognition,ICCV 2021 [\[论文\]](https://arxiv.org/abs/2104.09952)[\[github\]](https://github.com/MCG-NJU/MGSampler)
+- MultiSports: A Multi-Person Video Dataset of Spatio-Temporally Localized Sports Actions,ICCV 2021 [\[论文\]](https://arxiv.org/abs/2105.07404)
+- Long Short-Term Transformer for Online Action Detection,NeurIPS 2021 [\[论文\]](https://arxiv.org/abs/2107.03377)[\[github\]](https://github.com/amazon-research/long-short-term-transformer)
diff --git a/docs/zh_cn/notes/faq.md b/docs/zh_cn/notes/faq.md
deleted file mode 100644
index 85d30ff06c..0000000000
--- a/docs/zh_cn/notes/faq.md
+++ /dev/null
@@ -1 +0,0 @@
-# 常见问题(内容建设中)
diff --git a/docs/zh_cn/notes/projects.md b/docs/zh_cn/notes/projects.md
deleted file mode 100644
index 6734f69804..0000000000
--- a/docs/zh_cn/notes/projects.md
+++ /dev/null
@@ -1 +0,0 @@
-# 基于 MMAction2 的项目列表(内容建设中)
diff --git a/docs/zh_cn/notes/pytorch2.0.md b/docs/zh_cn/notes/pytorch2.0.md
new file mode 100644
index 0000000000..d50101490b
--- /dev/null
+++ b/docs/zh_cn/notes/pytorch2.0.md
@@ -0,0 +1,21 @@
+# PyTorch 2.0 Compatibility and Benchmark
+
+PyTorch introduced `torch.compile` in its 2.0 release. It compiles the model to speed up training and validation. We provide benchmark results and the compatibility of typical MMAction2 models below. Except for one model (MViT) that fails to compile, the performance of the other models remains consistent before and after compilation.
+
+| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric |
+| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 |
+| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | False | 0.323 | 21651 | 78.90 |
+| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 |
+| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 |
+| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 |
+| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible |
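+
+For reference, `torch.compile` wraps an existing `nn.Module` and returns an optimized module with the same call interface. The snippet below is only a minimal sketch that uses a toy 3D network as a stand-in for a video recognizer; it is not one of the configs benchmarked above.
+
+```python
+import torch
+import torch.nn as nn
+
+# Toy stand-in for a video recognizer: (N, C, T, H, W) -> class scores.
+model = nn.Sequential(
+    nn.Conv3d(3, 8, kernel_size=3, padding=1),
+    nn.AdaptiveAvgPool3d(1),
+    nn.Flatten(),
+    nn.Linear(8, 400),
+)
+
+compiled_model = torch.compile(model)  # requires PyTorch >= 2.0
+
+x = torch.randn(2, 3, 4, 32, 32)
+loss = compiled_model(x).sum()
+loss.backward()  # the first iterations are slower while kernels are being compiled
+```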
diff --git a/docs/zh_cn/project_zoo.py b/docs/zh_cn/project_zoo.py
new file mode 100644
index 0000000000..1b59d1abb7
--- /dev/null
+++ b/docs/zh_cn/project_zoo.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+from pathlib import Path
+
+from utils import replace_link
+
+# This script reads /projects/*/README.md and generates projectzoo.md
+
+all_files = list(Path('../../projects/').glob('*/README.md'))
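+# Keep the example project at the top of the generated page.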
+example_project = '../../projects/example_project/README.md'
+all_files.remove(Path(example_project))
+all_files.insert(0, Path(example_project))
+
+project_zoo = open('../../projects/README.md').read()
+for file in all_files:
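+    # Prefer the Chinese README when it exists alongside the English one.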
+ chinese_readme = Path(str(file).replace('README.md', 'README_zh-CN.md'))
+ if chinese_readme.exists():
+ file = chinese_readme
+ with open(file) as f:
+ content = f.read()
+ content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
+ file)
+ content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content,
+ file)
+
+ project_zoo += content
+
+with open('projectzoo.md', 'w') as f:
+ f.write(project_zoo)
diff --git a/docs/zh_cn/stat.py b/docs/zh_cn/stat.py
index 166f7b32bd..0860cf6bbf 100644
--- a/docs/zh_cn/stat.py
+++ b/docs/zh_cn/stat.py
@@ -1,174 +1,268 @@
#!/usr/bin/env python
-# Copyright (c) OpenMMLab. All rights reserved.
-import functools as func
-import glob
import re
-from os.path import basename, splitext
-
-import numpy as np
-import titlecase
-
-
-def anchor(name):
- return re.sub(r'-+', '-', re.sub(r'[^a-zA-Z0-9]', '-',
- name.strip().lower())).strip('-')
-
-
-# Count algorithms
-
-files = sorted(glob.glob('*_models.md'))
-# files = sorted(glob.glob('docs/*_models.md'))
-
-stats = []
-
-for f in files:
- with open(f, 'r') as content_file:
- content = content_file.read()
-
- # title
- title = content.split('\n')[0].replace('#', '')
-
- # skip IMAGE and ABSTRACT tags
- content = [
- x for x in content.split('\n')
- if 'IMAGE' not in x and 'ABSTRACT' not in x
- ]
- content = '\n'.join(content)
-
- # count papers
- papers = set(
- (papertype, titlecase.titlecase(paper.lower().strip()))
- for (papertype, paper) in re.findall(
- r'\s*\n.*?\btitle\s*=\s*{(.*?)}',
- content, re.DOTALL))
- # paper links
- revcontent = '\n'.join(list(reversed(content.splitlines())))
- paperlinks = {}
- for _, p in papers:
- print(p)
- q = p.replace('\\', '\\\\').replace('?', '\\?')
- paperlinks[p] = ' '.join(
- (f'[->]({splitext(basename(f))[0]}.html#{anchor(paperlink)})'
- for paperlink in re.findall(
- rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
- revcontent, re.DOTALL | re.IGNORECASE)))
- print(' ', paperlinks[p])
- paperlist = '\n'.join(
- sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
- # count configs
- configs = set(x.lower().strip()
- for x in re.findall(r'https.*configs/.*\.py', content))
-
- # count ckpts
- ckpts = set(x.lower().strip()
- for x in re.findall(r'https://download.*\.pth', content)
- if 'mmaction' in x)
-
- statsmsg = f"""
-## [{title}]({f})
-
-* 模型权重文件数量: {len(ckpts)}
-* 配置文件数量: {len(configs)}
-* 论文数量: {len(papers)}
-{paperlist}
-
- """
-
- stats.append((papers, configs, ckpts, statsmsg))
-
-allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _, _ in stats])
-allconfigs = func.reduce(lambda a, b: a.union(b), [c for _, c, _, _ in stats])
-allckpts = func.reduce(lambda a, b: a.union(b), [c for _, _, c, _ in stats])
-msglist = '\n'.join(x for _, _, _, x in stats)
-
-papertypes, papercounts = np.unique([t for t, _ in allpapers],
- return_counts=True)
-countstr = '\n'.join(
- [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
-
-modelzoo = f"""
+import shutil
+from collections import defaultdict
+from pathlib import Path
+
+from modelindex.load_model_index import load
+from modelindex.models.Result import Result
+from tabulate import tabulate
+from utils import replace_link
+
+MMACT_ROOT = Path(__file__).absolute().parents[2]
+PAPERS_ROOT = Path('model_zoo') # Path to save generated paper pages.
+GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/'
+MODELZOO_TEMPLATE = """\
# 模型库统计
-* 模型权重文件数量: {len(allckpts)}
-* 配置文件数量: {len(allconfigs)}
-* 论文数量: {len(allpapers)}
-{countstr}
-
-For supported datasets, see [datasets overview](datasets.md).
-
-{msglist}
-"""
-
-with open('modelzoo.md', 'w') as f:
- f.write(modelzoo)
-
-# # Count datasets
-#
-# files = ['supported_datasets.md']
-# # files = sorted(glob.glob('docs/tasks/*.md'))
-#
-# datastats = []
-#
-# for f in files:
-# with open(f, 'r') as content_file:
-# content = content_file.read()
-#
-# # title
-# title = content.split('\n')[0].replace('#', '')
-#
-# # count papers
-# papers = set(
-# (papertype, titlecase.titlecase(paper.lower().strip()))
-# for (papertype, paper) in re.findall(
-# r'\s*\n.*?\btitle\s*=\s*{(.*?)}',
-# content, re.DOTALL))
-# # paper links
-# revcontent = '\n'.join(list(reversed(content.splitlines())))
-# paperlinks = {}
-# for _, p in papers:
-# print(p)
-# q = p.replace('\\', '\\\\').replace('?', '\\?')
-# paperlinks[p] = ', '.join(
-# (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})'
-# for p in re.findall(
-# rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
-# revcontent, re.DOTALL | re.IGNORECASE)))
-# print(' ', paperlinks[p])
-# paperlist = '\n'.join(
-# sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
-#
-# statsmsg = f"""
-# ## [{title}]({f})
-#
-# * Number of papers: {len(papers)}
-# {paperlist}
-#
-# """
-#
-# datastats.append((papers, configs, ckpts, statsmsg))
-#
-# alldatapapers = func.reduce(lambda a, b: a.union(b),
-# [p for p, _, _, _ in datastats])
-#
-# # Summarize
-#
-# msglist = '\n'.join(x for _, _, _, x in stats)
-# datamsglist = '\n'.join(x for _, _, _, x in datastats)
-# papertypes, papercounts = np.unique([t for t, _ in alldatapapers],
-# return_counts=True)
-# countstr = '\n'.join(
-# [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
-#
-# modelzoo = f"""
-# # Overview
-#
-# * Number of papers: {len(alldatapapers)}
-# {countstr}
-#
-# For supported action algorithms, see [modelzoo overview](modelzoo.md).
-#
-# {datamsglist}
-# """
-#
-# with open('datasets.md', 'w') as f:
-# f.write(modelzoo)
+在本页面中,我们列举了我们支持的[所有算法](#所有已支持的算法)。你可以点击链接跳转至对应的模型详情页面。
+
+另外,我们还列出了我们提供的所有模型权重文件。你可以使用排序和搜索功能找到需要的模型权重,并使用链接跳转至模型详情页面。
+
+## 所有已支持的算法
+
+* 论文数量:{num_papers}
+{type_msg}
+
+* 模型权重文件数量:{num_ckpts}
+{paper_msg}
+
+""" # noqa: E501
+
+METRIC_ALIAS = {
+ 'Top 1 Accuracy': 'Top-1 (%)',
+ 'Top 5 Accuracy': 'Top-5 (%)',
+}
+
+TASK_MAP = dict(
+ detection='时空行为检测模型',
+ localization='时序动作定位模型',
+ recognition='行为识别模型',
+ skeleton='基于骨骼点的行为识别模型',
+ retrieval='视频检索模型',
+ recognition_audio='基于声音的行为识别模型')
+
+model_index = load(str(MMACT_ROOT / 'model-index.yml'))
+
+
+def build_collections(model_index):
+ # add models for collections
+ col_by_name = {}
+ for col in model_index.collections:
+ setattr(col, 'models', [])
+ col_by_name[col.name] = col
+
+ for model in model_index.models:
+ col = col_by_name[model.in_collection]
+ col.models.append(model)
+ setattr(model, 'collection', col)
+ if model.results is None:
+ setattr(model, 'tasks', [])
+ else:
+ setattr(model, 'tasks', [result.task for result in model.results])
+
+
+build_collections(model_index)
+
+# save a map from model name to title in README
+model2title = dict()
+
+
+def count_papers(collections):
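+    # Summarize each collection: count checkpoint links and paper types from its
+    # README, record the README title for later linking, then write modelzoo_statistics.md.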
+ total_num_ckpts = 0
+ type_count = defaultdict(int)
+ paper_msgs = []
+
+ for collection in collections:
+ with open(MMACT_ROOT / collection.readme) as f:
+ readme = f.read()
+
+ ckpts = set(x.lower().strip()
+ for x in re.findall(r'\[ckpt.*\]\((https?.*)\)', readme))
+ total_num_ckpts += len(ckpts)
+ title = collection.paper['Title']
+ papertype = collection.data.get('type', 'Algorithm')
+ type_count[papertype] += 1
+
+ readme_title = re.search(r'^#\s+.+', readme)
+
+ readme = Path(collection.filepath).parents[1].with_suffix('.md').name
+ model = Path(collection.filepath).parent.name
+ model2title[model] = readme_title.group()[2:].replace(' ', '-')
+ paper_msgs.append(f'\t- [{papertype}] [{title}]({PAPERS_ROOT / readme}'
+ f'#{model2title[model]}) ({len(ckpts)} ckpts)')
+
+ type_msg = '\n'.join(
+ [f'\t- {type_}: {count}' for type_, count in type_count.items()])
+ paper_msg = '\n'.join(paper_msgs)
+
+ modelzoo = MODELZOO_TEMPLATE.format(
+ num_papers=len(collections),
+ num_ckpts=total_num_ckpts,
+ type_msg=type_msg,
+ paper_msg=paper_msg,
+ )
+
+ with open('modelzoo_statistics.md', 'w') as f:
+ f.write(modelzoo)
+
+
+count_papers(model_index.collections)
+
+
+def generate_paper_page(collection):
+
+ # Write a copy of README
+ with open(MMACT_ROOT / collection.readme) as f:
+ content = f.read()
+ readme_path = Path(collection.filepath)
+ copy = PAPERS_ROOT / readme_path.parents[1].with_suffix('.md').name
+ if not copy.exists():
+ with open(copy, 'w') as copy_file:
+ task = readme_path.parents[1].name
+ head_content = f'# {TASK_MAP[task]}\n'
+ copy_file.write(head_content)
+
+ def lower_heading(match):
+ return '#' + match.group()
+
+ content = replace_link(r'\[([^\]]+)\]\(([^)]+)\)', '[{}]({})', content,
+ Path(collection.readme))
+ content = replace_link(r'\[([^\]]+)\]: (.*)', '[{}]: {}', content,
+ Path(collection.readme))
+
+ content = re.sub(r'^#+\s+.+', lower_heading, content, flags=re.M)
+
+ with open(copy, 'a') as copy_file:
+ copy_file.write(content)
+
+
+if PAPERS_ROOT.exists():
+ shutil.rmtree(PAPERS_ROOT)
+PAPERS_ROOT.mkdir(exist_ok=True)
+for collection in model_index.collections:
+ generate_paper_page(collection)
+
+
+def scatter_results(models):
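+    # Flatten models into (model, result) pairs; a model without results
+    # gets a single empty placeholder Result.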
+ model_result_pairs = []
+ for model in models:
+ if model.results is None:
+ result = Result(task=None, dataset=None, metrics={})
+ model_result_pairs.append((model, result))
+ else:
+ for result in model.results:
+ model_result_pairs.append((model, result))
+ return model_result_pairs
+
+
+def generate_summary_table(task, model_result_pairs, title=None):
+ metrics = set()
+ for model, result in model_result_pairs:
+ if result.task == task:
+ metrics = metrics.union(result.metrics.keys())
+ metrics = sorted(list(metrics))
+
+ rows = []
+
+ def convert2float(number):
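+        # Convert strings such as '32.5M' or '4.1G' into plain floats
+        # using the unit multipliers below.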
+ units = {'M': 1e6, 'G': 1e9, 'T': 1e12}
+ if isinstance(number, str):
+ num = float(number.rstrip('MGT'))
+ number = num * units[number[-1]]
+ return number
+
+ for model, result in model_result_pairs:
+ if result.task != task:
+ continue
+ name = model.name
+ if model.metadata.parameters is not None:
+ params = convert2float(model.metadata.parameters)
+ params = f'{params / 1e6:.2f}' # Params
+ else:
+ params = None
+ if model.metadata.flops is not None:
+ flops = convert2float(model.metadata.flops)
+ flops = f'{flops / 1e9:.2f}' # Flops
+ else:
+ flops = None
+
+ readme = Path(
+ model.collection.filepath).parents[1].with_suffix('.md').name
+ model = Path(model.collection.filepath).parent.name
+ page = f'[链接]({PAPERS_ROOT / readme}#{model2title[model]})'
+ model_metrics = []
+ for metric in metrics:
+ model_metrics.append(str(result.metrics.get(metric, '')))
+
+ rows.append([name, params, flops, *model_metrics, page])
+
+ with open('modelzoo_statistics.md', 'a') as f:
+ if title is not None:
+ f.write(f'\n{title}')
+ f.write("""\n```{table}\n:class: model-summary\n""")
+ header = [
+ '模型',
+ '参数量 (M)',
+ 'Flops (G)',
+ *[METRIC_ALIAS.get(metric, metric) for metric in metrics],
+ 'Readme',
+ ]
+ table_cfg = dict(
+ tablefmt='pipe',
+ floatfmt='.2f',
+ numalign='right',
+ stralign='center')
+ f.write(tabulate(rows, header, **table_cfg))
+ f.write('\n```\n')
+
+
+def generate_dataset_wise_table(task, model_result_pairs, title=None):
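+    # Group the (model, result) pairs of the given task by dataset and
+    # emit one summary table per dataset.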
+ dataset_rows = defaultdict(list)
+ for model, result in model_result_pairs:
+ if result.task == task:
+ dataset_rows[result.dataset].append((model, result))
+
+ if title is not None:
+ with open('modelzoo_statistics.md', 'a') as f:
+ f.write(f'\n{title}')
+ for dataset, pairs in dataset_rows.items():
+ generate_summary_table(task, pairs, title=f'### {dataset}')
+
+
+model_result_pairs = scatter_results(model_index.models)
+
+# Generate Action Recognition Summary
+generate_dataset_wise_table(
+ task='Action Recognition',
+ model_result_pairs=model_result_pairs,
+ title='## 行为识别',
+)
+
+# Generate Action Detection Summary
+generate_dataset_wise_table(
+ task='Action Detection',
+ model_result_pairs=model_result_pairs,
+ title='## 时空行为检测',
+)
+
+# Generate Skeleton-based Action Recognition Summary
+generate_dataset_wise_table(
+ task='Skeleton-based Action Recognition',
+ model_result_pairs=model_result_pairs,
+ title='## 骨骼点行为识别',
+)
+
+# Generate Video Retrieval Summary
+generate_dataset_wise_table(
+ task='Video Retrieval',
+ model_result_pairs=model_result_pairs,
+ title='## 视频检索',
+)
+
+# Generate Temporal Action Localization Summary
+generate_dataset_wise_table(
+ task='Temporal Action Localization',
+ model_result_pairs=model_result_pairs,
+ title='## 时序动作定位',
+)
diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md
index 80cf0dc571..4bade2237f 100644
--- a/docs/zh_cn/switch_language.md
+++ b/docs/zh_cn/switch_language.md
@@ -1,3 +1,3 @@
## English
-## 简体中文
+## 简体中文
diff --git a/docs/zh_cn/useful_tools.md b/docs/zh_cn/useful_tools.md
new file mode 100644
index 0000000000..6860428940
--- /dev/null
+++ b/docs/zh_cn/useful_tools.md
@@ -0,0 +1,91 @@
+# 分析工具
+
+除了训练/测试脚本外,我们在 `tools/` 目录下还提供了许多有用的工具。
+
+## 分析工具链接
+
+
+
+- [模型转换](#模型转换)
+  - [准备模型进行发布](#准备模型进行发布)
+- [杂项](#杂项)
+  - [评估指标](#评估指标)
+  - [打印完整配置](#打印完整配置)
+  - [检查视频](#检查视频)
+  - [多流融合](#多流融合)
+
+
+
+## 模型转换
+
+### 准备模型进行发布
+
+`tools/deployment/publish_model.py` 帮助用户准备他们的模型进行发布。
+
+在将模型上传到 AWS 之前,您可能想要:
+
+(1)将模型权重转换为 CPU 张量。
+(2)删除优化器状态信息。
+(3)计算权重文件的哈希值,并将哈希值添加到文件名中。
+
+```shell
+python tools/deployment/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
+```
+
+例如,
+
+```shell
+python tools/deployment/publish_model.py work_dirs/tsn_r50_8xb32-1x1x3-100e_kinetics400-rgb/latest.pth tsn_r50_1x1x3_100e_kinetics400_rgb.pth
+```
+
+最终输出的文件名将是 `tsn_r50_1x1x3_100e_kinetics400_rgb-{hash id}.pth`。
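+
+下面是一个简化的 Python 示意(并非 `tools/deployment/publish_model.py` 的真实实现,文件名与哈希长度仅作示例),用来说明上述三个步骤大致做了什么:
+
+```python
+import hashlib
+import os
+
+import torch
+
+# (1) 将模型权重加载为 CPU 张量
+ckpt = torch.load('work_dirs/tsn/latest.pth', map_location='cpu')
+# (2) 删除优化器状态等训练专用信息(若存在)
+ckpt.pop('optimizer', None)
+torch.save(ckpt, 'tsn_publish.pth')
+# (3) 计算权重文件的哈希值,并将其拼接到文件名中
+with open('tsn_publish.pth', 'rb') as f:
+    sha = hashlib.sha256(f.read()).hexdigest()[:8]
+os.rename('tsn_publish.pth', f'tsn_publish-{sha}.pth')
+```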
+
+## 杂项
+
+### 评估指标
+
+`tools/analysis_tools/eval_metric.py` 根据配置文件评估保存在文件中的结果的某些指标。
+
+保存的结果文件是通过在 `tools/test.py` 中设置参数 `--out ${RESULT_FILE}` 来创建的,以指示结果文件,其中存储了整个模型的最终输出。
+
+```shell
+python tools/analysis_tools/eval_metric.py ${CONFIG_FILE} ${RESULT_FILE} [--eval ${EVAL_METRICS}] [--cfg-options ${CFG_OPTIONS}] [--eval-options ${EVAL_OPTIONS}]
+```
+
+### 打印完整配置
+
+`tools/analysis_tools/print_config.py` 逐字打印整个配置,展开所有导入项。
+
+```shell
+python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}]
+```
+
+### 检查视频
+
+`tools/analysis_tools/check_videos.py` 使用指定的视频编码器迭代由输入配置文件指定的所有样本,查找无效的视频(损坏或缺失),并将相应的文件路径保存到输出文件中。请注意,删除无效视频后,用户需要重新生成视频文件列表。
+
+```shell
+python tools/analysis_tools/check_videos.py ${CONFIG} [-h] [--options OPTIONS [OPTIONS ...]] [--cfg-options CFG_OPTIONS [CFG_OPTIONS ...]] [--output-file OUTPUT_FILE] [--split SPLIT] [--decoder DECODER] [--num-processes NUM_PROCESSES] [--remove-corrupted-videos]
+```
+
+### 多流融合
+
+`tools/analysis_tools/report_accuracy.py` 使用推理保存的结果(在测试时设置 `--dump res.pkl`)来融合多流预测分数,即后融合(late fusion)。
+
+```shell
+python tools/analysis_tools/report_accuracy.py [--preds ${RESULT_PKL_1 [RESULT_PKL_2 ...]}] [--coefficients ${COEFFICIENT_1 [COEFFICIENT_2, ...]}] [--apply-softmax]
+```
+
+以 joint-bone 融合为例,这是基于骨骼点的动作识别任务中的一种常见做法。
+
+```shell
+python tools/analysis_tools/report_accuracy.py --preds demo/fuse/joint.pkl demo/fuse/bone.pkl --coefficients 1.0 1.0
+```
+
+```
+Mean Class Accuracy: 0.9180
+Top 1 Accuracy: 0.9333
+Top 5 Accuracy: 0.9833
+```
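+
+下面给出一个简化的后融合示意(并非 `report_accuracy.py` 的完整实现,这里假设每个 pkl 文件中保存的是形状为「样本数 × 类别数」的分数数组):
+
+```python
+import pickle
+
+import numpy as np
+
+
+def late_fusion(pkl_paths, coefficients, apply_softmax=False):
+    """对多个数据流的预测分数做加权求和,返回融合后的预测类别。"""
+    fused = None
+    for path, coef in zip(pkl_paths, coefficients):
+        with open(path, 'rb') as f:
+            scores = np.asarray(pickle.load(f), dtype=np.float64)
+        if apply_softmax:
+            # 数值稳定的 softmax
+            scores = np.exp(scores - scores.max(axis=1, keepdims=True))
+            scores /= scores.sum(axis=1, keepdims=True)
+        fused = coef * scores if fused is None else fused + coef * scores
+    return fused.argmax(axis=1)
+
+
+# 例如,joint-bone 融合(文件路径仅作示例):
+# preds = late_fusion(['demo/fuse/joint.pkl', 'demo/fuse/bone.pkl'], [1.0, 1.0])
+```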
diff --git a/docs/zh_cn/user_guides/1_config.md b/docs/zh_cn/user_guides/1_config.md
deleted file mode 100644
index 4c897a8bea..0000000000
--- a/docs/zh_cn/user_guides/1_config.md
+++ /dev/null
@@ -1,707 +0,0 @@
-# 教程 1:如何编写配置文件
-
-MMAction2 使用 python 文件作为配置文件。其配置文件系统的设计将模块化与继承整合进来,方便用户进行各种实验。
-MMAction2 提供的所有配置文件都放置在 `$MMAction2/configs` 文件夹下,用户可以通过运行命令
-`python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` 来查看完整的配置信息,从而方便检查所对应的配置文件。
-
-
-
-- [通过命令行参数修改配置信息](#通过命令行参数修改配置信息)
-- [配置文件结构](#配置文件结构)
-- [配置文件命名规则](#配置文件命名规则)
- - [动作识别的配置文件系统](#动作识别的配置文件系统)
- - [时空动作检测的配置文件系统](#时空动作检测的配置文件系统)
- - [时序动作检测的配置文件系统](#时序动作检测的配置文件系统)
-
-
-
-## 通过命令行参数修改配置信息
-
-当用户使用脚本 "tools/train.py" 或者 "tools/test.py" 提交任务时,可以通过指定 `--cfg-options` 参数来直接修改所使用的配置文件内容。
-
-- 更新配置文件内的字典
-
- 用户可以按照原始配置中的字典键顺序来指定配置文件的设置。
- 例如,`--cfg-options model.backbone.norm_eval=False` 会改变 `train` 模式下模型主干网络 backbone 中所有的 BN 模块。
-
-- 更新配置文件内列表的键
-
- 配置文件中,存在一些由字典组成的列表。例如,训练数据前处理流水线 data.train.pipeline 就是 python 列表。
- 如,`[dict(type='SampleFrames'), ...]`。如果用户想更改其中的 `'SampleFrames'` 为 `'DenseSampleFrames'`,
- 可以指定 `--cfg-options data.train.pipeline.0.type=DenseSampleFrames`。
-
-- 更新列表/元组的值。
-
- 当配置文件中需要更新的是一个列表或者元组,例如,配置文件通常会设置 `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`,用户如果想更改,
- 需要指定 `--cfg-options model.data_preprocessor.mean="[128,128,128]"`。注意这里的引号 " 对于列表/元组数据类型的修改是必要的。
-
-## 配置文件结构
-
-在 `config/_base_` 文件夹下存在 3 种基本组件类型: 模型(model), 训练策略(schedule), 运行时的默认设置(default_runtime)。
-许多方法都可以方便地通过组合这些组件进行实现,如 TSN,I3D,SlowOnly 等。
-其中,通过 `_base_` 下组件来构建的配置被称为 _原始配置_(_primitive_)。
-
-对于在同一文件夹下的所有配置文件,MMAction2 推荐只存在 **一个** 对应的 _原始配置_ 文件。
-所有其他的配置文件都应该继承 _原始配置_ 文件,这样就能保证配置文件的最大继承深度为 3。
-
-为了方便理解,MMAction2 推荐用户继承现有方法的配置文件。
-例如,如需修改 TSN 的配置文件,用户应先通过 `_base_ = '../tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'` 继承 TSN 配置文件的基本结构,
-并修改其中必要的内容以完成继承。
-
-如果用户想实现一个独立于任何一个现有的方法结构的新方法,则可以在 `configs/TASK` 中建立新的文件夹。
-
-更多详细内容,请参考 [mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html)。
-
-## 配置文件命名规则
-
-MMAction2 按照以下风格进行配置文件命名,代码库的贡献者需要遵循相同的命名规则。配置文件名分为几个部分。逻辑上,不同的部分用下划线 `'_'`连接,同一部分中的设置用破折号 `'-'`连接。
-
-```
-{algorithm info}_{module info}_{training info}_{data info}.py
-```
-
-其中,`{xxx}` 表示必要的命名域,`[yyy]` 表示可选的命名域。
-
-- `{algorithm info}`:
- - `{model}`: 模型类型,如 `tsn`,`i3d`, `swin`, `vit` 等。
- - `[model setting]`: 一些模型上的特殊设置,如`base`, `p16`, `w877`等。
-- `{module info}`:
-  - `[pretrained info]`: 预训练信息,如 `kinetics400-pretrained`, `in1k-pre` 等。
- - `{backbone}`: 主干网络类型和预训练信息,如 `r50`(ResNet-50)等。
- - `[backbone setting]`: 对于一些骨干网络的特殊设置,如`nl-dot-product`, `bnfrozen`, `nopool`等。
-- `{training info}`:
-  - `{gpu x batch_per_gpu}`: GPU 数量以及每个 GPU 上的样本数。
- - `{pipeline setting}`: 采帧数据格式,形如 `dense`, `{clip_len}x{frame_interval}x{num_clips}`, `u48`等。
- - `{schedule}`: 训练策略设置,如 `20e` 表示 20 个周期(epoch)。
-- `{data info}`:
- - `{dataset}`:数据集名,如 `kinetics400`,`mmit`等。
- - `{modality}`: 帧的模态,如 `rgb`, `flow`, `keypoint-2d`等。
-
-### 动作识别的配置文件系统
-
-MMAction2 将模块化设计整合到配置文件系统中,以便执行各类不同实验。
-
-- 以 TSN 为例
-
- 为了帮助用户理解 MMAction2 的配置文件结构,以及动作识别系统中的一些模块,这里以 TSN 为例,给出其配置文件的注释。
- 对于每个模块的详细用法以及对应参数的选择,请参照 API 文档。
-
- ```python
- # 模型设置
- model = dict( # 模型的配置
- type='Recognizer2D', # 动作识别器的类型
- backbone=dict( # Backbone 字典设置
- type='ResNet', # Backbone 名
- pretrained='torchvision://resnet50', # 预训练模型的 url 或文件位置
- depth=50, # ResNet 模型深度
- norm_eval=False), # 训练时是否设置 BN 层为验证模式
- cls_head=dict( # 分类器字典设置
- type='TSNHead', # 分类器名
- num_classes=400, # 分类类别数量
- in_channels=2048, # 分类器里输入通道数
- spatial_type='avg', # 空间维度的池化种类
- consensus=dict(type='AvgConsensus', dim=1), # consensus 模块设置
- dropout_ratio=0.4, # dropout 层概率
- init_std=0.01, # 线性层初始化 std 值
- average_clips='prob'), # 平均多个 clip 结果的方法
- data_preprocessor=dict( # 数据预处理器的字典设置
- type='ActionDataPreprocessor', # 数据预处理器名
- mean=[123.675, 116.28, 103.53], # 不同通道归一化所用的平均值
- std=[58.395, 57.12, 57.375], # 不同通道归一化所用的方差
- format_shape='NCHW'), # 最终图像形状格式
- # 模型训练和测试的设置
- train_cfg=None, # 训练 TSN 的超参配置
- test_cfg=None) # 测试 TSN 的超参配置
-
- # 数据集设置
- dataset_type = 'RawframeDataset' # 训练,验证,测试的数据集类型
- data_root = 'data/kinetics400/rawframes_train/' # 训练集的根目录
- data_root_val = 'data/kinetics400/rawframes_val/' # 验证集,测试集的根目录
- ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' # 训练集的标注文件
- ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # 验证集的标注文件
- ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # 测试集的标注文件
-
- train_pipeline = [ # 训练数据前处理流水线步骤组成的列表
- dict( # SampleFrames 类的配置
- type='SampleFrames', # 选定采样哪些视频帧
- clip_len=1, # 每个输出视频片段的帧
- frame_interval=1, # 所采相邻帧的时序间隔
- num_clips=3), # 所采帧片段的数量
- dict( # RawFrameDecode 类的配置
- type='RawFrameDecode'), # 给定帧序列,加载对应帧,解码对应帧
- dict( # Resize 类的配置
- type='Resize', # 调整图片尺寸
- scale=(-1, 256)), # 调整比例
- dict( # MultiScaleCrop 类的配置
- type='MultiScaleCrop', # 多尺寸裁剪,随机从一系列给定尺寸中选择一个比例尺寸进行裁剪
- input_size=224, # 网络输入
- scales=(1, 0.875, 0.75, 0.66), # 长宽比例选择范围
- random_crop=False, # 是否进行随机裁剪
- max_wh_scale_gap=1), # 长宽最大比例间隔
- dict( # Resize 类的配置
- type='Resize', # 调整图片尺寸
- scale=(224, 224), # 调整比例
- keep_ratio=False), # 是否保持长宽比
- dict( # Flip 类的配置
- type='Flip', # 图片翻转
- flip_ratio=0.5), # 执行翻转几率
- dict( # FormatShape 类的配置
- type='FormatShape', # 将图片格式转变为给定的输入格式
- input_format='NCHW'), # 最终的图片组成格式
- dict( # PackActionInputs 类的配置
- type='PackActionInputs') # 将输入数据打包
- ]
- val_pipeline = [ # 验证数据前处理流水线步骤组成的列表
- dict( # SampleFrames 类的配置
- type='SampleFrames', # 选定采样哪些视频帧
- clip_len=1, # 每个输出视频片段的帧
- frame_interval=1, # 所采相邻帧的时序间隔
- num_clips=3, # 所采帧片段的数量
- test_mode=True), # 是否设置为测试模式采帧
- dict( # RawFrameDecode 类的配置
- type='RawFrameDecode'), # 给定帧序列,加载对应帧,解码对应帧
- dict( # Resize 类的配置
- type='Resize', # 调整图片尺寸
- scale=(-1, 256)), # 调整比例
- dict( # CenterCrop 类的配置
- type='CenterCrop', # 中心裁剪
- crop_size=224), # 裁剪部分的尺寸
- dict( # Flip 类的配置
- type='Flip', # 图片翻转
- flip_ratio=0), # 翻转几率
- dict( # FormatShape 类的配置
- type='FormatShape', # 将图片格式转变为给定的输入格式
- input_format='NCHW'), # 最终的图片组成格式
- dict( # PackActionInputs 类的配置
- type='PackActionInputs') # 将输入数据打包
- ]
- test_pipeline = [ # 测试数据前处理流水线步骤组成的列表
- dict( # SampleFrames 类的配置
- type='SampleFrames', # 选定采样哪些视频帧
- clip_len=1, # 每个输出视频片段的帧
- frame_interval=1, # 所采相邻帧的时序间隔
- num_clips=25, # 所采帧片段的数量
- test_mode=True), # 是否设置为测试模式采帧
- dict( # RawFrameDecode 类的配置
- type='RawFrameDecode'), # 给定帧序列,加载对应帧,解码对应帧
- dict( # Resize 类的配置
- type='Resize', # 调整图片尺寸
- scale=(-1, 256)), # 调整比例
- dict( # TenCrop 类的配置
- type='TenCrop', # 裁剪 10 个区域
- crop_size=224), # 裁剪部分的尺寸
- dict( # Flip 类的配置
- type='Flip', # 图片翻转
- flip_ratio=0), # 执行翻转几率
- dict( # FormatShape 类的配置
- type='FormatShape', # 将图片格式转变为给定的输入格式
- input_format='NCHW'), # 最终的图片组成格式
- dict( # PackActionInputs 类的配置
- type='PackActionInputs') # 将输入数据打包
- ]
-
- train_dataloader = dict( # 训练过程 dataloader 的配置
- batch_size=32, # 训练过程单个 GPU 的批大小
- num_workers=8, # 训练过程单个 GPU 的 数据预取的进程数
- persistent_workers=True, # 保持`Dataset` 实例
- sampler=dict(type='DefaultSampler', shuffle=True),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_train,
- data_prefix=dict(video=data_root),
- pipeline=train_pipeline))
- val_dataloader = dict( # 验证过程 dataloader 的配置
- batch_size=1, # 验证过程单个 GPU 的批大小
- num_workers=8, # 验证过程单个 GPU 的 数据预取的进程
- persistent_workers=True, # 保持`Dataset` 实例
- sampler=dict(type='DefaultSampler', shuffle=False),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_val,
- data_prefix=dict(video=data_root_val),
- pipeline=val_pipeline,
- test_mode=True))
- test_dataloader = dict( # 测试过程 dataloader 的配置
- batch_size=32, # 测试过程单个 GPU 的批大小
- num_workers=8, # 测试过程单个 GPU 的 数据预取的进程
- persistent_workers=True, # 保持`Dataset` 实例
- sampler=dict(type='DefaultSampler', shuffle=False),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_val,
- data_prefix=dict(video=data_root_val),
- pipeline=test_pipeline,
- test_mode=True))
-
- # 评测器设置
- val_evaluator = dict(type='AccMetric') # 用于计算验证指标的评测对象
- test_evaluator = dict(type='AccMetric') # 用于计算测试指标的评测对象
-
- train_cfg = dict( # 训练循环的配置
- type='EpochBasedTrainLoop', # 训练循环的名称
- max_epochs=100, # 整体循环次数
- val_begin=1, # 开始验证的轮次
- val_interval=1) # 执行验证的间隔
- val_cfg = dict( # 验证循环的配置
- type='ValLoop') # 验证循环的名称
- test_cfg = dict( # 测试循环的配置
- type='TestLoop') # 测试循环的名称
-
- # 学习策略设置
- param_scheduler = [ # 用于更新优化器参数的参数调度程序,支持字典或列表
- dict(type='MultiStepLR', # 当轮次数达到阈值,学习率衰减
- begin=0, # 开始更新学习率的步长
- end=100, # 停止更新学习率的步长
- by_epoch=True, # 学习率是否按轮次更新
- milestones=[40, 80], # 学习率衰减阈值
- gamma=0.1)] # 学习率衰减的乘数因子
-
- # 优化器设置
- optim_wrapper = dict( # 优化器钩子的配置
- type='OptimWrapper', # 优化器封装的名称, 切换到 AmpOptimWrapper 可以实现混合精度训练
- optimizer=dict( # 优化器配置。 支持各种在pytorch上的优化器。 参考 https://pytorch.org/docs/stable/optim.html#algorithms
- type='SGD', # 优化器名称
- lr=0.01, # 学习率
- momentum=0.9, # 动量大小
- weight_decay=0.0001) # SGD 优化器权重衰减
- clip_grad=dict(max_norm=40, norm_type=2)) # 梯度裁剪的配置
-
- # 运行设置
- default_scope = 'mmaction' # 查找模块的默认注册表范围。 参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html
- default_hooks = dict( # 执行默认操作的钩子,如更新模型参数和保存checkpoints。
- runtime_info=dict(type='RuntimeInfoHook'), # 将运行信息更新到消息中心的钩子。
- timer=dict(type='IterTimerHook'), # 记录迭代期间花费时间的日志。
- logger=dict(
- type='LoggerHook', # 记录训练/验证/测试阶段记录日志。
- interval=20, # 打印日志间隔
- ignore_last=False), # 忽略每个轮次中最后一次迭代的日志
- param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中一些超参数的钩子
- checkpoint=dict(
- type='CheckpointHook', # 定期保存检查点的钩子
- interval=3, # 保存周期
- save_best='auto', # 在评估期间测量最佳检查点的指标
- max_keep_ckpts=3), # 要保留的最大检查点
- sampler_seed=dict(type='DistSamplerSeedHook'), # 分布式训练的数据加载采样器
- sync_buffers=dict(type='SyncBuffersHook')) # 在每个轮次结束时同步模型缓冲区
- env_cfg = dict( # 环境设置
- cudnn_benchmark=False, # 是否启用cudnn基准
- mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多线程处理的参数
- dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口
-
- log_processor = dict(
- type='LogProcessor', # 用于格式化日志信息的日志处理器
- window_size=20, # 默认平滑间隔
- by_epoch=True) # 是否以epoch类型格式化日志
- vis_backends = [ # 可视化后端列表
- dict(type='LocalVisBackend')] # 本地可视化后端
- visualizer = dict( # 可视化工具的配置
- type='ActionVisualizer', # 可视化工具的名称
- vis_backends=vis_backends)
- log_level = 'INFO' # 日志记录级别
- load_from = None # 从给定路径加载模型checkpoint作为预训练模型。这不会恢复训练。
- resume = False # 是否从`load_from`中定义的checkpoint恢复。如果“load_from”为“None”,它将恢复“work_dir”中的最新的checkpoint。
- ```
-
-### 时空动作检测的配置文件系统
-
-MMAction2 将模块化设计整合到配置文件系统中,以便于执行各种不同的实验。
-
-- 以 FastRCNN 为例
-
- 为了帮助用户理解 MMAction2 的完整配置文件结构,以及时空检测系统中的一些模块,这里以 FastRCNN 为例,给出其配置文件的注释。
- 对于每个模块的详细用法以及对应参数的选择,请参照 [API 文档](https://mmaction2.readthedocs.io/en/latest/api.html)。
-
- ```python
- # 模型设置
- model = dict( # 模型的配置
- type='FastRCNN', # 时空检测器类型
- _scope_='mmdet', # 当前配置的范围
- backbone=dict( # Backbone 字典设置
- type='ResNet3dSlowOnly', # Backbone 名
- depth=50, # ResNet 模型深度
- pretrained=None, # 预训练模型的 url 或文件位置
- pretrained2d=False, # 预训练模型是否为 2D 模型
- lateral=False, # backbone 是否有侧连接
- num_stages=4, # ResNet 模型阶数
- conv1_kernel=(1, 7, 7), # Conv1 卷积核尺寸
- conv1_stride_t=1, # Conv1 时序步长
- pool1_stride_t=1, # Pool1 时序步长
- spatial_strides=(1, 2, 2, 1)), # 每个 ResNet 阶的空间步长
- roi_head=dict( # roi_head 字典设置
- type='AVARoIHead', # roi_head 名
- bbox_roi_extractor=dict( # bbox_roi_extractor 字典设置
- type='SingleRoIExtractor3D', # bbox_roi_extractor 名
- roi_layer_type='RoIAlign', # RoI op 类型
- output_size=8, # RoI op 输出特征尺寸
- with_temporal_pool=True), # 时序维度是否要经过池化
- bbox_head=dict( # bbox_head 字典设置
- type='BBoxHeadAVA', # bbox_head 名
- in_channels=2048, # 输入特征通道数
- num_classes=81, # 动作类别数 + 1(背景)
- multilabel=True, # 数据集是否多标签
- dropout_ratio=0.5)), # dropout 比率
- data_preprocessor=dict( # 数据预处理器的字典
- type='ActionDataPreprocessor', # 数据预处理器的名称
- mean=[123.675, 116.28, 103.53], # 不同通道归一化的均值
- std=[58.395, 57.12, 57.375], # 不同通道归一化的方差
- format_shape='NCHW')), # 最终图像形状
- # 模型训练和测试的设置
- train_cfg=dict( # 训练 FastRCNN 的超参配置
- rcnn=dict( # rcnn 训练字典设置
- assigner=dict( # assigner 字典设置
- type='MaxIoUAssignerAVA', # assigner 名
- pos_iou_thr=0.9, # 正样本 IoU 阈值, > pos_iou_thr -> positive
- neg_iou_thr=0.9, # 负样本 IoU 阈值, < neg_iou_thr -> negative
- min_pos_iou=0.9), # 正样本最小可接受 IoU
- sampler=dict( # sample 字典设置
- type='RandomSampler', # sampler 名
- num=32, # sampler 批大小
- pos_fraction=1, # sampler 正样本边界框比率
- neg_pos_ub=-1, # 负样本数转正样本数的比率上界
- add_gt_as_proposals=True), # 是否添加 ground truth 为候选
- pos_weight=1.0)), # 正样本 loss 权重
- test_cfg=dict( # 测试 FastRCNN 的超参设置
- rcnn=dict(rcnn=None)) # rcnn 测试字典设置
-
- # 数据集设置
- dataset_type = 'AVADataset' # 训练,验证,测试的数据集类型
- data_root = 'data/ava/rawframes' # 训练集的根目录
- anno_root = 'data/ava/annotations' # 标注文件目录
-
- ann_file_train = f'{anno_root}/ava_train_v2.1.csv' # 训练集的标注文件
- ann_file_val = f'{anno_root}/ava_val_v2.1.csv' # 验证集的标注文件
-
- exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' # 训练除外数据集文件路径
- exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' # 验证除外数据集文件路径
-
- label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' # 标签文件路径
-
- proposal_file_train = f'{anno_root}/ava_dense_proposals_train.FAIR.recall_93.9.pkl' # 训练样本检测候选框的文件路径
- proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' # 验证样本检测候选框的文件路径
-
-
- train_pipeline = [ # 训练数据前处理流水线步骤组成的列表
- dict( # SampleFrames 类的配置
- type='AVASampleFrames', # 选定采样哪些视频帧
- clip_len=4, # 每个输出视频片段的帧
- frame_interval=16), # 所采相邻帧的时序间隔
- dict( # RawFrameDecode 类的配置
- type='RawFrameDecode'), # 给定帧序列,加载对应帧,解码对应帧
- dict( # RandomRescale 类的配置
- type='RandomRescale', # 给定一个范围,进行随机短边缩放
- scale_range=(256, 320)), # RandomRescale 的短边缩放范围
- dict( # RandomCrop 类的配置
- type='RandomCrop', # 给定一个尺寸进行随机裁剪
- size=256), # 裁剪尺寸
- dict( # Flip 类的配置
- type='Flip', # 图片翻转
- flip_ratio=0.5), # 执行翻转几率
- dict( # FormatShape 类的配置
- type='FormatShape', # 将图片格式转变为给定的输入格式
- input_format='NCTHW', # 最终的图片组成格式
- collapse=True), # 去掉 N 梯度当 N == 1
- dict(type='PackActionInputs')# 打包输入数据
- ]
-
- val_pipeline = [ # 验证数据前处理流水线步骤组成的列表
- dict( # SampleFrames 类的配置
- type='AVASampleFrames', # 选定采样哪些视频帧
- clip_len=4, # 每个输出视频片段的帧
- frame_interval=16), # 所采相邻帧的时序间隔
- dict( # RawFrameDecode 类的配置
- type='RawFrameDecode'), # 给定帧序列,加载对应帧,解码对应帧
- dict( # Resize 类的配置
- type='Resize', # 调整图片尺寸
- scale=(-1, 256)), # 调整比例
- dict( # FormatShape 类的配置
- type='FormatShape', # 将图片格式转变为给定的输入格式
- input_format='NCTHW', # 最终的图片组成格式
- collapse=True), # 去掉 N 梯度当 N == 1
- dict(type='PackActionInputs') # 打包输入数据
- ]
-
- train_dataloader = dict( # 训练过程 dataloader 的配置
- batch_size=32, # 训练过程单个 GPU 的批大小
- num_workers=8, # 训练过程单个 GPU 的 数据预取的进程
- persistent_workers=True, # 如果为“True”,则数据加载器不会在轮次结束后关闭工作进程,这可以加快训练速度
- sampler=dict(
- type='DefaultSampler', # 支持分布式和非分布式的DefaultSampler
- shuffle=True), 随机打乱每个轮次的训练数据
- dataset=dict( # 训练数据集的配置
- type=dataset_type,
- ann_file=ann_file_train, # 标注文件的路径
- exclude_file=exclude_file_train, # 不包括的标注文件路径
- label_file=label_file, # 标签文件的路径
- data_prefix=dict(img=data_root), # 帧路径的前缀
- proposal_file=proposal_file_train, # 行人检测框的路径
- pipeline=train_pipeline))
- val_dataloader = dict( # 验证过程 dataloader 的配置
- batch_size=1, # 验证过程单个 GPU 的批大小
- num_workers=8, # 验证过程单个 GPU 的 数据预取的进程
- persistent_workers=True, # 保持`Dataset` 实例
- sampler=dict(
- type='DefaultSampler',
- shuffle=False), # 在验证测试期间不打乱数据
- dataset=dict( # 验证集的配置
- type=dataset_type,
- ann_file=ann_file_val, # 标注文件的路径
- exclude_file=exclude_file_train, # 不包括的标注文件路径
- label_file=label_file, # 标签文件的路径
- data_prefix=dict(video=data_root_val), # 帧路径的前缀
- proposal_file=proposal_file_val, # # 行人检测框的路径
- pipeline=val_pipeline,
- test_mode=True))
- test_dataloader = val_dataloader # 测试过程 dataloader 的配置
-
-
- # 评估器设置
- val_evaluator = dict( # 验证评估器的配置
- type='AccMetric',
- ann_file=ann_file_val,
- label_file=label_file,
- exclude_file=exclude_file_val)
- test_evaluator = val_evaluator # 测试评估器的配置
-
- train_cfg = dict( # 训练循环的配置
- type='EpochBasedTrainLoop', # 训练循环的名称
- max_epochs=20, # 整体循环次数
- val_begin=1, # 开始验证的轮次
- val_interval=1) # 执行验证的间隔
- val_cfg = dict( # 验证循环的配置
- type='ValLoop') # 验证循环的名称
- test_cfg = dict( # 测试循环的配置
- type='TestLoop') # 测试循环的名称
-
- # 学习策略设置
- param_scheduler = [ # 用于更新优化器参数的参数调度程序,支持字典或列表
- dict(type='LinearLR',# 通过乘法因子线性衰减来降低各参数组的学习率
- start_factor=0.1,# 乘以第一个轮次的学习率的数值
- by_epoch=True,# 学习率是否按轮次更新
- begin=0,# 开始更新学习率的步长
- end=5),# 停止更新学习率的步长
- dict(type='MultiStepLR', # 当轮次数达到阈值,学习率衰减
- begin=0, # 开始更新学习率的步长
- end=20, # 停止更新学习率的步长
- by_epoch=True, # 学习率是否按轮次更新
- milestones=[10, 15], # 学习率衰减阈值
- gamma=0.1)] # 学习率衰减的乘数因子
-
-
- # 优化器设置
- optim_wrapper = dict( # 优化器钩子的配置
- type='OptimWrapper', # 优化器封装的名称, 切换到 AmpOptimWrapper 可以实现混合精度训练
- optimizer=dict( # 优化器配置。 支持各种在pytorch上的优化器。 参考 https://pytorch.org/docs/stable/optim.html#algorithms
- type='SGD', # 优化器名称
- lr=0.2, # 学习率
- momentum=0.9, # 动量大小
- weight_decay=0.0001) # SGD 优化器权重衰减
- clip_grad=dict(max_norm=40, norm_type=2)) # 梯度裁剪的配置
-
- # 运行设置
- default_scope = 'mmaction' # 查找模块的默认注册表范围。 参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html
- default_hooks = dict( # 执行默认操作的钩子,如更新模型参数和保存checkpoints。
- runtime_info=dict(type='RuntimeInfoHook'), # 将运行信息更新到消息中心的钩子。
- timer=dict(type='IterTimerHook'), # 记录迭代期间花费时间的日志。
- logger=dict(
- type='LoggerHook', # 记录训练/验证/测试阶段记录日志。
- interval=20, # 打印日志间隔
- ignore_last=False), # 忽略每个轮次中最后一次迭代的日志
- param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中一些超参数的钩子
- checkpoint=dict(
- type='CheckpointHook', # 定期保存检查点的钩子
- interval=3, # 保存周期
- save_best='auto', # 在评估期间测量最佳检查点的指标
- max_keep_ckpts=3), # 要保留的最大检查点
- sampler_seed=dict(type='DistSamplerSeedHook'), # 分布式训练的数据加载采样器
- sync_buffers=dict(type='SyncBuffersHook')) # 在每个轮次结束时同步模型缓冲区
- env_cfg = dict( # 环境设置
- cudnn_benchmark=False, # 是否启用cudnn基准
- mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多线程处理的参数
- dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口
-
- log_processor = dict(
- type='LogProcessor', # 用于格式化日志信息的日志处理器
- window_size=20, # 默认平滑间隔
- by_epoch=True) # 是否以epoch类型格式化日志
- vis_backends = [ # 可视化后端列表
- dict(type='LocalVisBackend')] # 本地可视化后端
- visualizer = dict( # 可视化工具的配置
- type='ActionVisualizer', # 可视化工具的名称
- vis_backends=vis_backends)
- log_level = 'INFO' # 日志记录级别
- load_from = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
- 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/'
- 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth') # 从给定路径加载模型checkpoint作为预训练模型。这不会恢复训练。
- resume = False # 是否从`load_from`中定义的checkpoint恢复。如果“load_from”为“None”,它将恢复“work_dir”中的最新的checkpoint。
- ```
-
-### 时序动作检测的配置文件系统
-
-MMAction2 将模块化设计整合到配置文件系统中,以便于执行各种不同的实验。
-
-- 以 BMN 为例
-
- 为了帮助用户理解 MMAction2 的配置文件结构,以及时序动作检测系统中的一些模块,这里以 BMN 为例,给出其配置文件的注释。
- 对于每个模块的详细用法以及对应参数的选择,请参照 [API 文档](https://mmaction2.readthedocs.io/en/latest/api.html)。
-
- ```python
- # 模型设置
- model = dict( # 模型的配置
- type='BMN', # 时序动作检测器的类型
- temporal_dim=100, # 每个视频中所选择的帧数量
- boundary_ratio=0.5, # 视频边界的决策几率
- num_samples=32, # 每个候选的采样数
- num_samples_per_bin=3, # 每个样本的直方图采样数
- feat_dim=400, # 特征维度
- soft_nms_alpha=0.4, # soft-NMS 的 alpha 值
- soft_nms_low_threshold=0.5, # soft-NMS 的下界
- soft_nms_high_threshold=0.9, # soft-NMS 的上界
- post_process_top_k=100) # 后处理得到的最好的 K 个 proposal
-
- # 数据集设置
- dataset_type = 'ActivityNetDataset' # 训练,验证,测试的数据集类型
- data_root = 'data/activitynet_feature_cuhk/csv_mean_100/' # 训练集的根目录
- data_root_val = 'data/activitynet_feature_cuhk/csv_mean_100/' # 验证集和测试集的根目录
- ann_file_train = 'data/ActivityNet/anet_anno_train.json' # 训练集的标注文件
- ann_file_val = 'data/ActivityNet/anet_anno_val.json' # 验证集的标注文件
- ann_file_test = 'data/ActivityNet/anet_anno_test.json' # 测试集的标注文件
-
- train_pipeline = [ # 训练数据前处理流水线步骤组成的列表
- dict(type='LoadLocalizationFeature'), # 加载时序动作检测特征
- dict(type='GenerateLocalizationLabels'), # 生成时序动作检测标签
- dict(
- type='PackLocalizationInputs', # 时序数据打包
- keys=('gt_bbox'), # 输入的键
- meta_keys=('video_name'))] # 输入的元键
- val_pipeline = [ # 验证数据前处理流水线步骤组成的列表
- dict(type='LoadLocalizationFeature'), # 加载时序动作检测特征
- dict(type='GenerateLocalizationLabels'), # 生成时序动作检测标签
- dict(
- type='PackLocalizationInputs', # 时序数据打包
- keys=('gt_bbox'), # 输入的键
- meta_keys= ('video_name', 'duration_second', 'duration_frame',
- 'annotations', 'feature_frame'))], # 输入的元键
- test_pipeline = [ # 测试数据前处理流水线步骤组成的列表
- dict(type='LoadLocalizationFeature'), # 加载时序动作检测特征
- dict(
- type='PackLocalizationInputs', # 时序数据打包
- keys=('gt_bbox'), # 输入的键
- meta_keys= ('video_name', 'duration_second', 'duration_frame',
- 'annotations', 'feature_frame'))], # 输入的元键
- train_dataloader = dict( # 训练过程 dataloader 的配置
- batch_size=8, # 训练过程单个 GPU 的批大小
- num_workers=8, # 训练过程单个 GPU 的 数据预取的进程
- persistent_workers=True, # 如果为“True”,则数据加载器不会在轮次结束后关闭工作进程,这可以加快训练速度
- sampler=dict(
- type='DefaultSampler', # 支持分布式和非分布式的DefaultSampler
- shuffle=True), 随机打乱每个轮次的训练数据
- dataset=dict( # 训练数据集的配置
- type=dataset_type,
- ann_file=ann_file_train, # 标签文件的路径
- exclude_file=exclude_file_train, # 不包括的标签文件路径
- label_file=label_file, # 标签文件的路径
- data_prefix=dict(video=data_root),
- data_prefix=dict(img=data_root), # Prefix of frame path
- pipeline=train_pipeline))
- val_dataloader = dict( # 验证过程 dataloader 的配置
- batch_size=1, # 验证过程单个 GPU 的批大小
- num_workers=8, # 验证过程单个 GPU 的 数据预取的进程
- persistent_workers=True, # 保持`Dataset` 实例
- sampler=dict(
- type='DefaultSampler',
- shuffle=False), # 在验证测试过程中不打乱数据
- dataset=dict( # 验证数据集的配置
- type=dataset_type,
- ann_file=ann_file_val, # 标注文件的路径
- data_prefix=dict(video=data_root_val), # 视频路径的前缀
- pipeline=val_pipeline,
- test_mode=True))
- test_dataloader = dict( # 测试过程 dataloader 的配置
- batch_size=1, #测试过程单个 GPU 的批大小
- num_workers=8, # 测试过程单个 GPU 的 数据预取的进程
- persistent_workers=True, # 保持`Dataset` 实例
- sampler=dict(
- type='DefaultSampler',
- shuffle=False), # 在验证测试过程中不打乱数据
- dataset=dict( # 测试数据集的配置
- type=dataset_type,
- ann_file=ann_file_val, # 标注文件的路径
- data_prefix=dict(video=data_root_val), # 视频路径的前缀
- pipeline=test_pipeline,
- test_mode=True))
-
-
- # 评估器设置
- work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' # 用于保存当前试验的模型检查点和日志的目录
- val_evaluator = dict( # 验证评估器的配置
- type='AccMetric',
- metric_type='AR@AN',
- dump_config=dict( # 时序输出的配置
- out=f'{work_dir}/results.json', # 输出文件的路径
- output_format='json')) # 输出文件的文件格式
- test_evaluator = val_evaluator # 测试评估器的配置
-
- max_epochs = 9 # Total epochs to train the model
- train_cfg = dict( # 训练循环的配置
- type='EpochBasedTrainLoop', # 训练循环的名称
- max_epochs=100, # 整体循环次数
- val_begin=1, # 开始验证的轮次
- val_interval=1) # 执行验证的间隔
- val_cfg = dict( # 验证循环的配置
- type='ValLoop') # 验证循环的名称
- test_cfg = dict( # 测试循环的配置
- type='TestLoop') # 测试循环的名称
-
- # 学习策略设置
- param_scheduler = [ # 用于更新优化器参数的参数调度程序,支持字典或列表
- dict(type='MultiStepLR', # 当轮次数达到阈值,学习率衰减
- begin=0, # 开始更新学习率的步长
- end=max_epochs, # 停止更新学习率的步长
- by_epoch=True, # 学习率是否按轮次更新
- milestones=[7, ], # 学习率衰减阈值
- gamma=0.1)] # 学习率衰减的乘数因子
-
- # 优化器设置
- optim_wrapper = dict( # 优化器钩子的配置
- type='OptimWrapper', # 优化器封装的名称, 切换到 AmpOptimWrapper 可以实现混合精度训练
- optimizer=dict( # 优化器配置。 支持各种在pytorch上的优化器。 参考 https://pytorch.org/docs/stable/optim.html#algorithms
- type='Adam', # 优化器名称
- lr=0.001, # 学习率
- weight_decay=0.0001) # 权重衰减
- clip_grad=dict(max_norm=40, norm_type=2)) # 梯度裁剪的配置
-
- # 运行设置
- default_scope = 'mmaction' # 查找模块的默认注册表范围。 参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html
- default_hooks = dict( # 执行默认操作的钩子,如更新模型参数和保存checkpoints。
- runtime_info=dict(type='RuntimeInfoHook'), # 将运行信息更新到消息中心的钩子。
- timer=dict(type='IterTimerHook'), # 记录迭代期间花费时间的日志。
- logger=dict(
- type='LoggerHook', # 记录训练/验证/测试阶段记录日志。
- interval=20, # 打印日志间隔
- ignore_last=False), # 忽略每个轮次中最后一次迭代的日志
- param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中一些超参数的钩子
- checkpoint=dict(
- type='CheckpointHook', # 定期保存检查点的钩子
- interval=3, # 保存周期
- save_best='auto', # 在评估期间测量最佳检查点的指标
- max_keep_ckpts=3), # 要保留的最大检查点
- sampler_seed=dict(type='DistSamplerSeedHook'), # 分布式训练的数据加载采样器
- sync_buffers=dict(type='SyncBuffersHook')) # 在每个轮次结束时同步模型缓冲区
- env_cfg = dict( # 环境设置
- cudnn_benchmark=False, # 是否启用cudnn基准
- mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多线程处理的参数
- dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口
-
- log_processor = dict(
- type='LogProcessor', # 用于格式化日志信息的日志处理器
- window_size=20, # 默认平滑间隔
- by_epoch=True) # 是否以epoch类型格式化日志
- vis_backends = [ # 可视化后端列表
- dict(type='LocalVisBackend')] # 本地可视化后端
- visualizer = dict( # 可视化工具的配置
- type='ActionVisualizer', # 可视化工具的名称
- vis_backends=vis_backends)
- log_level = 'INFO' # 日志记录级别
- load_from = None # 从给定路径加载模型checkpoint作为预训练模型。这不会恢复训练。
- resume = False # 是否从`load_from`中定义的checkpoint恢复。如果“load_from”为“None”,它将恢复“work_dir”中的最新的checkpoint。
- ```
diff --git a/docs/zh_cn/user_guides/2_data_prepare.md b/docs/zh_cn/user_guides/2_data_prepare.md
deleted file mode 100644
index a2312461da..0000000000
--- a/docs/zh_cn/user_guides/2_data_prepare.md
+++ /dev/null
@@ -1 +0,0 @@
-# 教程 2:准备数据集(内容建设中)
diff --git a/docs/zh_cn/user_guides/3_inference.md b/docs/zh_cn/user_guides/3_inference.md
deleted file mode 100644
index d7c1cc4173..0000000000
--- a/docs/zh_cn/user_guides/3_inference.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# 教程3:利用现有模型进行推理
-
-MMAction2 在 [Model Zoo](../modelzoo.md) 中提供预训练的视频理解模型。
-本教程将展示**如何使用现有模型对给定视频进行推理**。
-
-至于如何在标准数据集上测试现有模型,请参阅这该[指南](./train_test.md#test)
-
-## 给定视频的推理
-
-MMAction2提供了高级 Python APIs,用于对给定视频进行推理:
-
-- [init_recognizer](mmaction.apis.init_recognizer): 用配置和检查点初始化一个识别器。
-- [inference_recognizer](mmaction.apis.inference_recognizer): 对给定视频进行推理。
-
-下面是一个使用 Kinetics-400 预训练检查点在给定视频上构建模型和推理的示例。
-
-```{note}
-如果使用mmaction2作为第三方包,则需要下载示例中的config和演示视频。
-
-下载所需的配置:'mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .'
-
-下载所需的演示视频:'wget https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.mp4'
-```
-
-```python
-from mmaction.apis import inference_recognizer, init_recognizer
-
-config_path = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py'
-checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' # 可以是本地路径
-img_path = 'demo/demo.mp4' # 您可以指定自己的视频路径
-
-# 从配置文件和检查点文件构建模型
-model = init_recognizer(config_path, checkpoint_path, device="cpu") # 也可以是 'cuda:0'
-# 测试单个视频
-result = inference_recognizer(model, img_path)
-```
-
-`result` 是一个包含 `pred_scores` 的字典。动作识别示例代码详见 [demo/demo.py](https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.py)。
diff --git a/docs/zh_cn/user_guides/4_train_test.md b/docs/zh_cn/user_guides/4_train_test.md
deleted file mode 100644
index 70973799e2..0000000000
--- a/docs/zh_cn/user_guides/4_train_test.md
+++ /dev/null
@@ -1 +0,0 @@
-# 教程 4:训练与测试(内容建设中)
diff --git a/docs/zh_cn/user_guides/config.md b/docs/zh_cn/user_guides/config.md
new file mode 100644
index 0000000000..fe436f623c
--- /dev/null
+++ b/docs/zh_cn/user_guides/config.md
@@ -0,0 +1,711 @@
+# 学习配置文件
+
+我们使用 Python 文件作为配置文件,将模块化和继承设计融入我们的配置系统中,这方便进行各种实验。
+您可以在 `$MMAction2/configs` 目录下找到所有提供的配置文件。如果您想要查看配置文件,
+您可以运行 `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` 来查看完整的配置文件。
+
+
+
+- [学习配置文件](#学习配置文件)
+ - [通过脚本参数修改配置](#通过脚本参数修改配置)
+ - [配置文件结构](#配置文件结构)
+ - [配置文件命名约定](#配置文件命名约定)
+ - [动作识别的配置系统](#动作识别的配置系统)
+ - [时空动作检测的配置系统](#时空动作检测的配置系统)
+ - [动作定位的配置系统](#动作定位的配置系统)
+
+
+
+## 通过脚本参数修改配置
+
+在使用 `tools/train.py` 或 `tools/test.py` 提交作业时,您可以通过指定 `--cfg-options` 来原地修改配置。
+
+- 更新字典的配置键。
+
+ 可以按照原始配置中字典键的顺序来指定配置选项。
+ 例如,`--cfg-options model.backbone.norm_eval=False` 将模型骨干中的所有 BN 模块更改为 `train` 模式。
+
+- 更新配置列表中的键。
+
+ 一些配置字典在配置文件中以列表形式组成。例如,训练流程 `train_pipeline` 通常是一个列表,
+ 例如 `[dict(type='SampleFrames'), ...]`。如果您想要在流程中将 `'SampleFrames'` 更改为 `'DenseSampleFrames'`,
+ 您可以指定 `--cfg-options train_pipeline.0.type=DenseSampleFrames`。
+
+- 更新列表/元组的值。
+
+  如果要更新的值是列表或元组,例如,配置文件通常设置 `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`。如果您想要
+  更改此键,您可以指定 `--cfg-options model.data_preprocessor.mean="[128,128,128]"`。请注意,引号 " 是支持列表/元组数据类型所必需的(完整的使用示例见本节末尾)。
+
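+下面给出一段等价的 Python 示意代码,帮助理解 `--cfg-options` 在脚本内部的效果:先用 mmengine 读入配置,再通过 `merge_from_dict` 合并同样格式的键值对(假设在 MMAction2 仓库根目录下运行,合并的键值仅作演示):
+
+```python
+from mmengine.config import Config
+
+# 读入前文示例中的 TSN 配置文件
+cfg = Config.fromfile(
+    'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
+
+# 与命令行 `--cfg-options key=value` 等价的字典形式覆盖
+cfg.merge_from_dict({
+    'model.backbone.norm_eval': False,                 # 更新字典的配置键
+    'train_pipeline.0.type': 'DenseSampleFrames',      # 更新配置列表中的键
+    'model.data_preprocessor.mean': [128, 128, 128],   # 更新列表/元组的值
+})
+
+print(cfg.model.backbone.norm_eval)  # False
+```
+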
+## 配置文件结构
+
+`configs/_base_` 下有 3 种基本组件类型,即 models、schedules 和 default_runtime。
+许多方法只需要一个模型、一个训练计划和一个默认运行时组件就可以轻松构建,如 TSN、I3D、SlowOnly 等。
+由 `_base_` 组件组成的配置文件被称为 _primitive_。
+
+对于同一文件夹下的所有配置文件,建议只有**一个** _primitive_ 配置文件。其他所有配置文件都应该继承自 _primitive_ 配置文件。这样,继承级别的最大值为 3。
+
+为了方便理解,我们建议贡献者继承现有方法。
+例如,如果基于 TSN 进行了一些修改,用户可以首先通过指定 `_base_ = '../tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'` 来继承基本的 TSN 结构,然后在配置文件中修改必要的字段。
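+
+下面是一个极简的继承写法示意(文件名与具体取值均为假设,仅用于说明继承机制):
+
+```python
+# 新建的配置文件,假设新数据集有 101 个类别、只需训练 50 个轮次
+_base_ = ['../tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py']
+
+# 只写需要修改的字段,其余内容全部从 _base_ 继承
+model = dict(cls_head=dict(num_classes=101))
+train_cfg = dict(max_epochs=50)
+```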
+
+如果您正在构建一个与任何现有方法的结构不共享的全新方法,可以在 `configs/TASK` 下创建一个文件夹。
+
+请参考 [mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) 获取详细文档。
+
+## 配置文件命名约定
+
+我们遵循以下样式来命名配置文件。建议贡献者遵循相同的样式。配置文件名分为几个部分,不同部分逻辑上用下划线 `'_'` 连接,同一部分的设置用破折号 `'-'` 连接。
+
+```
+{算法信息}_{模块信息}_{训练信息}_{数据信息}.py
+```
+
+`{xxx}` 是必填字段,`[yyy]` 是可选字段。
+
+- `{算法信息}`:
+ - `{模型}`: 模型类型,例如 `tsn`、`i3d`、`swin`、`vit` 等。
+ - `[模型设置]`: 某些模型的特定设置,例如 `base`、`p16`、`w877` 等。
+- `{模块信息}`:
+ - `[预训练信息]`: 预训练信息,例如 `kinetics400-pretrained`、`in1k-pre` 等。
+ - `{骨干网络}`: 骨干网络类型,例如 `r50`(ResNet-50)等。
+ - `[骨干网络设置]`: 某些骨干网络的特定设置,例如 `nl-dot-product`、`bnfrozen`、`nopool` 等。
+- `{训练信息}`:
+  - `{gpu x batch_per_gpu}`: GPU 数量和每个 GPU 上的样本数。
+ - `{pipeline设置}`: 帧采样设置,例如 `dense`、`{clip_len}x{frame_interval}x{num_clips}`、`u48` 等。
+ - `{schedule}`: 训练计划,例如 `coslr-20e`。
+- `{数据信息}`:
+ - `{数据集}`: 数据集名称,例如 `kinetics400`、`mmit` 等。
+ - `{模态}`: 数据模态,例如 `rgb`、`flow`、`keypoint-2d` 等。
+
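+以 `tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 为例,按照上述规则可以把文件名拆解为:
+
+- `tsn`:`{模型}`,即算法信息
+- `imagenet-pretrained`:`[预训练信息]`
+- `r50`:`{骨干网络}`,即 ResNet-50
+- `8xb32`:8 个 GPU,每个 GPU 上 32 个样本
+- `1x1x3`:`{clip_len}x{frame_interval}x{num_clips}` 形式的帧采样设置
+- `100e`:训练 100 个 epoch
+- `kinetics400`:`{数据集}`
+- `rgb`:`{模态}`
+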
+### 动作识别的配置系统
+
+我们将模块化设计融入我们的配置系统中,
+这方便进行各种实验。
+
+- TSN 的示例
+
+ 为了帮助用户对完整的配置结构和动作识别系统中的模块有一个基本的了解,
+ 我们对 TSN 的配置进行简要注释如下。有关每个模块中每个参数的更详细用法和替代方法,请参阅 API 文档。
+
+ ```python
+ # 模型设置
+ model = dict( # 模型的配置
+ type='Recognizer2D', # 识别器的类名
+ backbone=dict( # 骨干网络的配置
+ type='ResNet', # 骨干网络的名称
+ pretrained='torchvision://resnet50', # 预训练模型的 URL/网站
+ depth=50, # ResNet 模型的深度
+ norm_eval=False), # 是否在训练时将 BN 层设置为评估模式
+ cls_head=dict( # 分类头的配置
+ type='TSNHead', # 分类头的名称
+ num_classes=400, # 要分类的类别数量。
+ in_channels=2048, # 分类头的输入通道数。
+ spatial_type='avg', # 空间维度池化的类型
+ consensus=dict(type='AvgConsensus', dim=1), # 一致性模块的配置
+ dropout_ratio=0.4, # dropout 层中的概率
+ init_std=0.01, # 线性层初始化的标准差值
+ average_clips='prob'), # 平均多个剪辑结果的方法
+ data_preprocessor=dict( # 数据预处理器的配置
+ type='ActionDataPreprocessor', # 数据预处理器的名称
+ mean=[123.675, 116.28, 103.53], # 不同通道的均值用于归一化
+ std=[58.395, 57.12, 57.375], # 不同通道的标准差用于归一化
+ format_shape='NCHW'), # 最终图像形状的格式
+ # 模型训练和测试设置
+ train_cfg=None, # TSN 的训练超参数的配置
+ test_cfg=None) # TSN 的测试超参数的配置
+
+ # 数据集设置
+ dataset_type = 'RawframeDataset' # 用于训练、验证和测试的数据集类型
+ data_root = 'data/kinetics400/rawframes_train/' # 用于训练的数据的根路径
+ data_root_val = 'data/kinetics400/rawframes_val/' # 用于验证和测试的数据的根路径
+ ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' # 用于训练的注释文件的路径
+ ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # 用于验证的注释文件的路径
+ ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # 用于测试的注释文件的路径
+
+ train_pipeline = [ # 训练数据处理流程
+ dict( # SampleFrames 的配置
+ type='SampleFrames', # 采样帧的流程,从视频中采样帧
+ clip_len=1, # 每个采样输出剪辑的帧数
+ frame_interval=1, # 相邻采样帧的时间间隔
+ num_clips=3), # 要采样的剪辑数
+ dict( # RawFrameDecode 的配置
+ type='RawFrameDecode'), # 加载和解码帧的流程,选择给定索引的原始帧
+ dict( # Resize 的配置
+ type='Resize', # 调整大小的流程
+ scale=(-1, 256)), # 要调整图像的比例
+ dict( # MultiScaleCrop 的配置
+ type='MultiScaleCrop', # 多尺度裁剪的流程,根据随机选择的尺度列表裁剪图像
+ input_size=224, # 网络的输入大小
+ scales=(1, 0.875, 0.75, 0.66), # 要选择的宽度和高度的尺度
+ random_crop=False, # 是否随机采样裁剪框
+ max_wh_scale_gap=1), # 宽度和高度尺度级别的最大差距
+ dict( # Resize 的配置
+ type='Resize', # 调整大小的流程
+ scale=(224, 224), # 要调整图像的比例
+ keep_ratio=False), # 是否保持纵横比进行调整大小
+ dict( # Flip 的配置
+ type='Flip', # 翻转的流程
+ flip_ratio=0.5), # 实施翻转的概率
+ dict( # FormatShape 的配置
+ type='FormatShape', # 格式化形状的流程,将最终图像形状格式化为给定的 input_format
+ input_format='NCHW'), # 最终图像形状的格式
+ dict(type='PackActionInputs') # PackActionInputs 的配置
+ ]
+ val_pipeline = [ # 验证数据处理流程
+ dict( # SampleFrames 的配置
+ type='SampleFrames', # 采样帧的流程,从视频中采样帧
+ clip_len=1, # 每个采样输出剪辑的帧数
+ frame_interval=1, # 相邻采样帧的时间间隔
+ num_clips=3, # 要采样的剪辑数
+ test_mode=True), # 是否在采样时设置为测试模式
+ dict( # RawFrameDecode 的配置
+ type='RawFrameDecode'), # 加载和解码帧的流程,选择给定索引的原始帧
+ dict( # Resize 的配置
+ type='Resize', # 调整大小的流程
+ scale=(-1, 256)), # 要调整图像的比例
+ dict( # CenterCrop 的配置
+ type='CenterCrop', # 中心裁剪的流程,从图像中裁剪中心区域
+ crop_size=224), # 要裁剪的图像大小
+ dict( # Flip 的配置
+ type='Flip', # 翻转的流程
+ flip_ratio=0), # 实施翻转的概率
+ dict( # FormatShape 的配置
+ type='FormatShape', # 格式化形状的流程,将最终图像形状格式化为给定的 input_format
+ input_format='NCHW'), # 最终图像形状的格式
+ dict(type='PackActionInputs') # PackActionInputs 的配置
+ ]
+ test_pipeline = [ # 测试数据处理流程
+ dict( # SampleFrames 的配置
+ type='SampleFrames', # 采样帧的流程,从视频中采样帧
+ clip_len=1, # 每个采样输出剪辑的帧数
+ frame_interval=1, # 相邻采样帧的时间间隔
+ num_clips=25, # 要采样的剪辑数
+ test_mode=True), # 是否在采样时设置为测试模式
+ dict( # RawFrameDecode 的配置
+ type='RawFrameDecode'), # 加载和解码帧的流程,选择给定索引的原始帧
+ dict( # Resize 的配置
+ type='Resize', # 调整大小的流程
+ scale=(-1, 256)), # 要调整图像的比例
+ dict( # TenCrop 的配置
+ type='TenCrop', # 十次裁剪的流程,从图像中裁剪十个区域
+ crop_size=224), # 要裁剪的图像大小
+ dict( # Flip 的配置
+ type='Flip', # 翻转的流程
+ flip_ratio=0), # 实施翻转的概率
+ dict( # FormatShape 的配置
+ type='FormatShape', # 格式化形状的流程,将最终图像形状格式化为给定的 input_format
+ input_format='NCHW'), # 最终图像形状的格式
+ dict(type='PackActionInputs') # PackActionInputs 的配置
+ ]
+
+ train_dataloader = dict( # 训练数据加载器的配置
+ batch_size=32, # 训练时每个单个 GPU 的批量大小
+ num_workers=8, # 训练时每个单个 GPU 的数据预取进程数
+      persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭工作进程,这可以加快训练速度
+ sampler=dict(
+ type='DefaultSampler', # 支持分布式和非分布式训练的 DefaultSampler。参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py
+ shuffle=True), # 每个 epoch 随机打乱训练数据
+ dataset=dict( # 训练数据集的配置
+ type=dataset_type,
+ ann_file=ann_file_train, # 注释文件的路径
+ data_prefix=dict(img=data_root), # 帧路径的前缀
+ pipeline=train_pipeline))
+ val_dataloader = dict( # 验证数据加载器的配置
+ batch_size=1, # 验证时每个单个 GPU 的批量大小
+ num_workers=8, # 验证时每个单个 GPU 的数据预取进程数
+ persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭工作进程
+ sampler=dict(
+ type='DefaultSampler',
+ shuffle=False), # 验证和测试时不进行随机打乱
+ dataset=dict( # 验证数据集的配置
+ type=dataset_type,
+ ann_file=ann_file_val, # 注释文件的路径
+ data_prefix=dict(img=data_root_val), # 帧路径的前缀
+ pipeline=val_pipeline,
+ test_mode=True))
+ test_dataloader = dict( # 测试数据加载器的配置
+ batch_size=32, # 测试时每个单个 GPU 的批量大小
+ num_workers=8, # 测试时每个单个 GPU 的数据预取进程数
+ persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭工作进程
+ sampler=dict(
+ type='DefaultSampler',
+ shuffle=False), # 验证和测试时不进行随机打乱
+ dataset=dict( # 测试数据集的配置
+ type=dataset_type,
+ ann_file=ann_file_val, # 注释文件的路径
+ data_prefix=dict(img=data_root_val), # 帧路径的前缀
+ pipeline=test_pipeline,
+ test_mode=True))
+
+ # 评估设置
+ val_evaluator = dict(type='AccMetric') # 验证评估器的配置
+ test_evaluator = val_evaluator # 测试评估器的配置
+
+ train_cfg = dict( # 训练循环的配置
+ type='EpochBasedTrainLoop', # 训练循环的名称
+ max_epochs=100, # 总的训练周期数
+ val_begin=1, # 开始验证的训练周期
+ val_interval=1) # 验证间隔
+ val_cfg = dict( # 验证循环的配置
+ type='ValLoop') # 验证循环的名称
+ test_cfg = dict( # 测试循环的配置
+ type='TestLoop') # 测试循环的名称
+
+ # 学习策略
+  param_scheduler = [ # 用于更新优化器参数的参数调度器,支持字典或列表
+ dict(type='MultiStepLR', # 达到一个里程碑时衰减学习率
+ begin=0, # 开始更新学习率的步骤
+ end=100, # 结束更新学习率的步骤
+ by_epoch=True, # 是否按 epoch 更新学习率
+ milestones=[40, 80], # 衰减学习率的步骤
+ gamma=0.1)] # 学习率衰减的乘法因子
+
+ # 优化器
+ optim_wrapper = dict( # 优化器包装器的配置
+ type='OptimWrapper', # 优化器包装器的名称,切换到 AmpOptimWrapper 可以启用混合精度训练
+ optimizer=dict( # 优化器的配置。支持 PyTorch 中的各种优化器。参考 https://pytorch.org/docs/stable/optim.html#algorithms
+ type='SGD', # 优化器的名称
+ lr=0.01, # 学习率
+ momentum=0.9, # 动量因子
+ weight_decay=0.0001), # 权重衰减
+ clip_grad=dict(max_norm=40, norm_type=2)) # 梯度裁剪的配置
+
+ # 运行时设置
+ default_scope = 'mmaction' # 用于查找模块的默认注册表作用域。参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html
+ default_hooks = dict( # 执行默认操作的钩子,如更新模型参数和保存权重。
+ runtime_info=dict(type='RuntimeInfoHook'), # 将运行时信息更新到消息中心的钩子
+ timer=dict(type='IterTimerHook'), # 用于记录迭代过程中花费的时间的日志记录器
+ logger=dict(
+ type='LoggerHook', # 用于记录训练/验证/测试阶段的日志记录器
+ interval=20, # 打印日志的间隔
+ ignore_last=False), # 忽略每个 epoch 中最后几个迭代的日志
+ param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中某些超参数的钩子
+ checkpoint=dict(
+ type='CheckpointHook', # 定期保存权重的钩子
+ interval=3, # 保存的周期
+ save_best='auto', # 用于评估最佳权重的指标
+ max_keep_ckpts=3), # 保留的最大权重文件数量
+ sampler_seed=dict(type='DistSamplerSeedHook'), # 用于分布式训练的数据加载采样器
+ sync_buffers=dict(type='SyncBuffersHook')) # 在每个 epoch 结束时同步模型缓冲区
+
+ env_cfg = dict( # 设置环境的字典
+ cudnn_benchmark=False, # 是否启用 cudnn benchmark
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多进程的参数
+ dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口号
+
+ log_processor = dict(
+ type='LogProcessor', # 用于格式化日志信息的日志处理器
+ window_size=20, # 默认的平滑间隔
+ by_epoch=True) # 是否使用 epoch 类型格式化日志
+ vis_backends = [ # 可视化后端的列表
+ dict(type='LocalVisBackend')] # 本地可视化后端
+ visualizer = dict( # 可视化器的配置
+ type='ActionVisualizer', # 可视化器的名称
+ vis_backends=vis_backends)
+ log_level = 'INFO' # 日志记录的级别
+ load_from = None # 从给定路径加载模型权重作为预训练模型。这不会恢复训练。
+ resume = False # 是否从 `load_from` 中定义的权重恢复。如果 `load_from` 为 None,则会从 `work_dir` 中恢复最新的权重。
+ ```
+
+### 时空动作检测的配置系统
+
+我们将模块化设计融入我们的配置系统中,这方便进行各种实验。
+
+- FastRCNN 的示例
+
+ 为了帮助用户对完整的配置结构和时空动作检测系统中的模块有一个基本的了解,
+ 我们对 FastRCNN 的配置进行简要注释如下。有关每个模块中每个参数的更详细用法和替代方法,请参阅 API 文档。
+
+ ```python
+ # 模型设置
+ model = dict( # 模型的配置
+ type='FastRCNN', # 检测器的类名
+ _scope_='mmdet', # 当前配置的范围
+ backbone=dict( # 骨干网络的配置
+ type='ResNet3dSlowOnly', # 骨干网络的名称
+ depth=50, # ResNet 模型的深度
+ pretrained=None, # 预训练模型的 URL/网站
+ pretrained2d=False, # 如果预训练模型是 2D 的
+ lateral=False, # 如果骨干网络带有横向连接
+ num_stages=4, # ResNet 模型的阶段数
+ conv1_kernel=(1, 7, 7), # Conv1 的卷积核大小
+ conv1_stride_t=1, # Conv1 的时间步长
+ pool1_stride_t=1, # Pool1 的时间步长
+ spatial_strides=(1, 2, 2, 1)), # 每个 ResNet 阶段的空间步长
+ roi_head=dict( # roi_head 的配置
+ type='AVARoIHead', # roi_head 的名称
+ bbox_roi_extractor=dict( # bbox_roi_extractor 的配置
+ type='SingleRoIExtractor3D', # bbox_roi_extractor 的名称
+ roi_layer_type='RoIAlign', # RoI 操作的类型
+ output_size=8, # RoI 操作的输出特征大小
+ with_temporal_pool=True), # 是否进行时间维度的池化
+ bbox_head=dict( # bbox_head 的配置
+ type='BBoxHeadAVA', # bbox_head 的名称
+ in_channels=2048, # 输入特征的通道数
+ num_classes=81, # 动作类别数 + 1
+ multilabel=True, # 数据集是否为多标签
+              dropout_ratio=0.5)), # 使用的 dropout 比例
+ data_preprocessor=dict( # 数据预处理器的配置
+ type='ActionDataPreprocessor', # 数据预处理器的名称
+ mean=[123.675, 116.28, 103.53], # 不同通道的均值用于归一化
+ std=[58.395, 57.12, 57.375], # 不同通道的标准差用于归一化
+          format_shape='NCHW'), # 最终图像形状的格式
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA', # 分配器的名称
+ pos_iou_thr=0.9, # 正样本的 IoU 阈值,> pos_iou_thr -> 正样本
+ neg_iou_thr=0.9, # 负样本的 IoU 阈值,< neg_iou_thr -> 负样本
+ min_pos_iou=0.9), # 正样本的最小可接受 IoU
+ sampler=dict(
+ type='RandomSampler', # 采样器的名称
+ num=32, # 采样器的批处理大小
+ pos_fraction=1, # 采样器的正样本比例
+ neg_pos_ub=-1, # 负样本与正样本数量比率的上限
+ add_gt_as_proposals=True), # 将 gt 边界框添加到 proposals 中
+ pos_weight=1.0)), # 正样本的损失权重
+ test_cfg=dict(rcnn=None)) # 测试的配置
+
+ # 数据集设置
+ dataset_type = 'AVADataset' # 训练、验证和测试的数据集类型
+ data_root = 'data/ava/rawframes' # 数据的根目录
+ anno_root = 'data/ava/annotations' # 注释的根目录
+
+ ann_file_train = f'{anno_root}/ava_train_v2.1.csv' # 训练注释文件的路径
+ ann_file_val = f'{anno_root}/ava_val_v2.1.csv' # 验证注释文件的路径
+
+ exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' # 训练排除注释文件的路径
+ exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' # 验证排除注释文件的路径
+
+ label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' # 标签文件的路径
+
+ proposal_file_train = f'{anno_root}/ava_dense_proposals_train.FAIR.recall_93.9.pkl' # 训练示例的人体检测 proposals 文件的路径
+ proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' # 验证示例的人体检测 proposals 文件的路径
+
+ train_pipeline = [
+ dict(
+ type='AVASampleFrames', # 从视频中采样帧的管道
+ clip_len=4, # 每个采样输出的帧数
+ frame_interval=16), # 相邻采样帧之间的时间间隔
+ dict(
+ type='RawFrameDecode'), # 加载和解码帧的管道,使用给定的索引选择原始帧
+ dict(
+ type='RandomRescale', # 随机缩放短边
+ scale_range=(256, 320)), # 随机缩放的短边尺寸范围
+ dict(
+ type='RandomCrop', # 随机裁剪给定大小的补丁
+ size=256), # 裁剪补丁的大小
+ dict(
+ type='Flip', # 翻转管道
+ flip_ratio=0.5), # 翻转的概率
+ dict(
+ type='FormatShape', # 格式化形状的管道,将最终图像形状格式化为给定的输入格式
+ input_format='NCTHW', # 最终图像形状的格式
+ collapse=True), # 如果 N == 1,则减少维度 N
+ dict(type='PackActionInputs') # 打包输入数据
+ ]
+
+ val_pipeline = [
+ dict(
+ type='AVASampleFrames', # 从视频中采样帧的管道
+ clip_len=4, # 每个采样输出的帧数
+ frame_interval=16), # 相邻采样帧之间的时间间隔
+ dict(
+ type='RawFrameDecode'), # 加载和解码帧的管道,使用给定的索引选择原始帧
+ dict(
+ type='Resize', # 调整大小的管道
+ scale=(-1, 256)), # 调整图像的尺度
+ dict(
+ type='FormatShape', # 格式化形状的管道,将最终图像形状格式化为给定的输入格式
+ input_format='NCTHW', # 最终图像形状的格式
+ collapse=True), # 如果 N == 1,则减少维度 N
+ dict(type='PackActionInputs') # 打包输入数据
+ ]
+
+ train_dataloader = dict(
+ batch_size=32, # 每个单 GPU 训练的批处理大小
+ num_workers=8, # 每个单 GPU 训练时预取数据的 worker 数量
+ persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程,这可以加快训练速度
+ sampler=dict(
+ type='DefaultSampler', # 默认采样器,支持分布式和非分布式训练。参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py
+ shuffle=True), # 在每个 epoch 中随机打乱训练数据
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train, # 注释文件的路径
+ exclude_file=exclude_file_train, # 排除注释文件的路径
+ label_file=label_file, # 标签文件的路径
+ data_prefix=dict(img=data_root), # 帧路径的前缀
+ proposal_file=proposal_file_train, # 人体检测 proposals 的路径
+ pipeline=train_pipeline)
+ )
+ val_dataloader = dict(
+ batch_size=1, # 每个单 GPU 评估的批处理大小
+ num_workers=8, # 每个单 GPU 评估时预取数据的 worker 数量
+ persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程
+ sampler=dict(
+ type='DefaultSampler',
+ shuffle=False), # 在验证和测试时不打乱数据
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val, # 注释文件的路径
+ exclude_file=exclude_file_val, # 排除注释文件的路径
+ label_file=label_file, # 标签文件的路径
+ data_prefix=dict(img=data_root_val), # 帧路径的前缀
+ proposal_file=proposal_file_val, # 人体检测 proposals 的路径
+ pipeline=val_pipeline,
+ test_mode=True)
+ )
+ test_dataloader = val_dataloader # 测试数据加载器的配置
+
+ # 评估设置
+ val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+ test_evaluator = val_evaluator # 测试评估器的配置
+
+ train_cfg = dict(
+ type='EpochBasedTrainLoop', # 训练循环的名称
+ max_epochs=20, # 总的训练 epoch 数量
+ val_begin=1, # 开始验证的 epoch
+ val_interval=1) # 验证的间隔
+ val_cfg = dict(
+ type='ValLoop') # 验证循环的名称
+ test_cfg = dict(
+ type='TestLoop') # 测试循环的名称
+
+ # 学习策略
+ param_scheduler = [
+ dict(
+ type='LinearLR', # 线性减少每个参数组的学习率
+ start_factor=0.1, # 第一个 epoch 中学习率的乘法因子
+ by_epoch=True, # 是否按 epoch 更新学习率
+ begin=0, # 开始更新学习率的步骤
+ end=5), # 停止更新学习率的步骤
+ dict(
+ type='MultiStepLR', # 当 epoch 数达到里程碑时,减少学习率
+ begin=0, # 开始更新学习率的步骤
+ end=20, # 停止更新学习率的步骤
+ by_epoch=True, # 是否按 epoch 更新学习率
+ milestones=[10, 15], # 学习率衰减的步骤
+ gamma=0.1) # 学习率衰减的乘法因子
+ ]
+
+ # 优化器
+ optim_wrapper = dict(
+ type='OptimWrapper', # 优化器包装器的名称,切换到 AmpOptimWrapper 以启用混合精度训练
+ optimizer=dict(
+ type='SGD', # 优化器的名称
+ lr=0.2, # 学习率
+ momentum=0.9, # 动量因子
+ weight_decay=0.0001), # 权重衰减
+ clip_grad=dict(max_norm=40, norm_type=2)) # 梯度剪裁的配置
+
+ # 运行时设置
+ default_scope = 'mmaction' # 默认注册表范围,用于查找模块。参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html
+ default_hooks = dict(
+ runtime_info=dict(type='RuntimeInfoHook'), # 将运行时信息更新到消息中心的钩子
+ timer=dict(type='IterTimerHook'), # 用于记录迭代过程中花费的时间的日志记录器
+ logger=dict(
+ type='LoggerHook', # 用于记录训练/验证/测试阶段的日志的日志记录器
+ interval=20, # 打印日志的间隔
+ ignore_last=False), # 忽略每个 epoch 中最后几次迭代的日志
+ param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中的某些超参数的钩子
+ checkpoint=dict(
+ type='CheckpointHook', # 定期保存权重的钩子
+ interval=3, # 保存周期
+ save_best='auto', # 在评估过程中测量最佳权重的指标
+ max_keep_ckpts=3), # 保留的最大权重文件数量
+ sampler_seed=dict(type='DistSamplerSeedHook'), # 用于分布式训练的数据加载采样器
+ sync_buffers=dict(type='SyncBuffersHook')) # 在每个 epoch 结束时同步模型缓冲区的钩子
+ env_cfg = dict(
+ cudnn_benchmark=False, # 是否启用 cudnn 的基准测试
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多进程的参数
+ dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口
+
+ log_processor = dict(
+ type='LogProcessor', # 用于格式化日志信息的日志处理器
+ window_size=20, # 默认平滑间隔
+ by_epoch=True) # 是否使用 epoch 类型格式化日志
+ vis_backends = [
+ dict(type='LocalVisBackend')] # 可视化后端的列表
+ visualizer = dict(
+ type='ActionVisualizer', # 可视化器的名称
+ vis_backends=vis_backends)
+ log_level = 'INFO' # 日志级别
+ load_from = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth') # 从给定路径加载模型权重作为预训练模型。这不会恢复训练。
+ resume = False # 是否从 `load_from` 中定义的权重恢复训练。如果 `load_from` 为 None,则会从 `work_dir` 中恢复最新的权重。
+ ```
+
+### 动作定位的配置系统
+
+我们将模块化设计引入了配置系统中,方便进行各种实验。
+
+- BMN 的示例
+
+ 为了帮助用户对完整的配置结构和动作定位系统中的模块有一个基本的了解,我们对 BMN 的配置进行了简要注释,具体如下所示。有关每个模块中每个参数的更详细用法和替代方法,请参阅 [API 文档](https://mmaction2.readthedocs.io/en/latest/api.html)。
+
+ ```python
+ # 模型设置
+ model = dict(
+ type='BMN', # 定位器的类名
+ temporal_dim=100, # 每个视频选取的总帧数
+ boundary_ratio=0.5, # 确定视频边界的比率
+ num_samples=32, # 每个 proposal 的采样数量
+ num_samples_per_bin=3, # 每个采样的 bin 的采样数量
+ feat_dim=400, # 特征的维度
+ soft_nms_alpha=0.4, # Soft NMS 的 alpha 值
+ soft_nms_low_threshold=0.5, # Soft NMS 的低阈值
+ soft_nms_high_threshold=0.9, # Soft NMS 的高阈值
+ post_process_top_k=100) # 后处理中的 top-k proposal 数量
+
+ # 数据集设置
+ dataset_type = 'ActivityNetDataset' # 用于训练、验证和测试的数据集类型
+ data_root = 'data/activitynet_feature_cuhk/csv_mean_100/' # 用于训练的数据的根目录
+ data_root_val = 'data/activitynet_feature_cuhk/csv_mean_100/' # 用于验证和测试的数据的根目录
+ ann_file_train = 'data/ActivityNet/anet_anno_train.json' # 用于训练的注释文件的路径
+ ann_file_val = 'data/ActivityNet/anet_anno_val.json' # 用于验证的注释文件的路径
+ ann_file_test = 'data/ActivityNet/anet_anno_test.json' # 用于测试的注释文件的路径
+
+ train_pipeline = [
+ dict(type='LoadLocalizationFeature'), # 加载定位特征的管道
+ dict(type='GenerateLocalizationLabels'), # 生成定位标签的管道
+ dict(
+ type='PackLocalizationInputs', # 打包定位数据
+ keys=('gt_bbox'), # 输入的键
+ meta_keys=('video_name'))] # 输入的元键
+ val_pipeline = [
+ dict(type='LoadLocalizationFeature'), # 加载定位特征的管道
+ dict(type='GenerateLocalizationLabels'), # 生成定位标签的管道
+ dict(
+ type='PackLocalizationInputs', # 打包定位数据
+ keys=('gt_bbox'), # 输入的键
+ meta_keys=('video_name', 'duration_second', 'duration_frame',
+ 'annotations', 'feature_frame'))] # 输入的元键
+ test_pipeline = [
+ dict(type='LoadLocalizationFeature'), # 加载定位特征的管道
+ dict(
+ type='PackLocalizationInputs', # 打包定位数据
+ keys=('gt_bbox'), # 输入的键
+ meta_keys=('video_name', 'duration_second', 'duration_frame',
+ 'annotations', 'feature_frame'))] # 输入的元键
+ train_dataloader = dict(
+ batch_size=8, # 每个单 GPU 训练的批处理大小
+ num_workers=8, # 每个单 GPU 训练时预取数据的 worker 数量
+ persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程,这可以加快训练速度
+ sampler=dict(
+ type='DefaultSampler', # 默认采样器,支持分布式和非分布式训练。参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py
+ shuffle=True), # 在每个 epoch 中随机打乱训练数据
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train, # 注释文件的路径
+ data_prefix=dict(video=data_root), # 视频路径的前缀
+ pipeline=train_pipeline)
+ )
+ val_dataloader = dict(
+ batch_size=1, # 每个单 GPU 评估的批处理大小
+ num_workers=8, # 每个单 GPU 评估时预取数据的 worker 数量
+ persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程
+ sampler=dict(
+ type='DefaultSampler',
+ shuffle=False), # 在验证和测试时不打乱数据
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val, # 注释文件的路径
+ data_prefix=dict(video=data_root_val), # 视频路径的前缀
+ pipeline=val_pipeline,
+ test_mode=True)
+ )
+ test_dataloader = dict(
+ batch_size=1, # 每个单 GPU 测试的批处理大小
+ num_workers=8, # 每个单 GPU 测试时预取数据的 worker 数量
+ persistent_workers=True, # 如果为 `True`,则数据加载器在一个 epoch 结束后不会关闭 worker 进程
+ sampler=dict(
+ type='DefaultSampler',
+ shuffle=False), # 在验证和测试时不打乱数据
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val, # 注释文件的路径
+ data_prefix=dict(video=data_root_val), # 视频路径的前缀
+ pipeline=test_pipeline,
+ test_mode=True)
+ )
+
+ # 评估设置
+ work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' # 保存当前实验的模型权重和日志的目录
+ val_evaluator = dict(
+ type='ANetMetric',
+ metric_type='AR@AN',
+ dump_config=dict(
+ out=f'{work_dir}/results.json', # 输出文件的路径
+ output_format='json')) # 输出文件的格式
+ test_evaluator = val_evaluator # 将 test_evaluator 设置为 val_evaluator
+
+ max_epochs = 9 # 训练模型的总 epoch 数量
+ train_cfg = dict(
+ type='EpochBasedTrainLoop', # 训练循环的名称
+ max_epochs=max_epochs, # 总的训练 epoch 数量
+ val_begin=1, # 开始验证的 epoch
+ val_interval=1) # 验证的间隔
+ val_cfg = dict(
+ type='ValLoop') # 验证循环的名称
+ test_cfg = dict(
+ type='TestLoop') # 测试循环的名称
+
+ # 学习策略
+ param_scheduler = [
+ dict(
+ type='MultiStepLR', # 当 epoch 数达到里程碑时,减少学习率
+ begin=0, # 开始更新学习率的步骤
+ end=max_epochs, # 停止更新学习率的步骤
+ by_epoch=True, # 是否按 epoch 更新学习率
+ milestones=[7, ], # 学习率衰减的步骤
+ gamma=0.1) # 学习率衰减的乘法因子
+ ]
+
+ # 优化器
+ optim_wrapper = dict(
+ type='OptimWrapper', # 优化器包装器的名称,切换到 AmpOptimWrapper 以启用混合精度训练
+ optimizer=dict(
+ type='Adam', # 优化器的名称
+ lr=0.001, # 学习率
+ weight_decay=0.0001), # 权重衰减
+ clip_grad=dict(max_norm=40, norm_type=2)) # 梯度剪裁的配置
+
+ # 运行时设置
+ default_scope = 'mmaction' # 默认注册表范围,用于查找模块。参考 https://mmengine.readthedocs.io/en/latest/tutorials/registry.html
+ default_hooks = dict(
+ runtime_info=dict(type='RuntimeInfoHook'), # 将运行时信息更新到消息中心的钩子
+ timer=dict(type='IterTimerHook'), # 用于记录迭代过程中花费的时间的日志记录器
+ logger=dict(
+ type='LoggerHook', # 用于记录训练/验证/测试阶段的日志的日志记录器
+ interval=20, # 打印日志的间隔
+ ignore_last=False), # 忽略每个 epoch 中最后几次迭代的日志
+ param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中的某些超参数的钩子
+ checkpoint=dict(
+ type='CheckpointHook', # 定期保存权重的钩子
+ interval=3, # 保存周期
+ save_best='auto', # 在评估过程中测量最佳权重的指标
+ max_keep_ckpts=3), # 保留的最大权重文件数量
+ sampler_seed=dict(type='DistSamplerSeedHook'), # 用于分布式训练的数据加载采样器
+ sync_buffers=dict(type='SyncBuffersHook')) # 在每个 epoch 结束时同步模型缓冲区的钩子
+ env_cfg = dict(
+ cudnn_benchmark=False, # 是否启用 cudnn 的基准测试
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), # 设置多进程的参数
+ dist_cfg=dict(backend='nccl')) # 设置分布式环境的参数,也可以设置端口
+
+ log_processor = dict(
+ type='LogProcessor', # 用于格式化日志信息的日志处理器
+ window_size=20, # 默认平滑间隔
+ by_epoch=True) # 是否使用 epoch 类型格式化日志
+ vis_backends = [
+ dict(type='LocalVisBackend')] # 可视化后端的列表
+ visualizer = dict(
+ type='ActionVisualizer', # 可视化器的名称
+ vis_backends=vis_backends)
+ log_level = 'INFO' # 日志级别
+ load_from = None # 从给定路径加载模型权重作为预训练模型。这不会恢复训练。
+ resume = False # 是否从 `load_from` 中定义的权重恢复训练。如果 `load_from` 为 None,则会从 `work_dir` 中恢复最新的权重。
+ ```
diff --git a/docs/zh_cn/user_guides/finetune.md b/docs/zh_cn/user_guides/finetune.md
new file mode 100644
index 0000000000..0394791acc
--- /dev/null
+++ b/docs/zh_cn/user_guides/finetune.md
@@ -0,0 +1,320 @@
+# 模型微调
+
+本教程提供了使用预训练模型在其他数据集上进行微调的指导。通过微调,可以获得更好的性能。
+
+- [模型微调](#模型微调)
+ - [概述](#概述)
+ - [选择模板配置](#选择模板配置)
+ - [修改 Head](#修改-head)
+ - [修改数据集](#修改数据集)
+ - [修改训练计划](#修改训练计划)
+ - [使用预训练模型](#使用预训练模型)
+ - [开始训练](#开始训练)
+
+## 概述
+
+在新数据集上进行模型微调有两个步骤。
+
+1. 添加对新数据集的支持。请参考[准备数据集](prepare_dataset.md)和[自定义数据集](../advanced_guides/customize_dataset.md)。
+2. 修改配置文件。本教程将讨论这一部分。
+
+## 选择模板配置
+
+这里我们以 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 为例。我们首先将该配置文件复制到同一文件夹,并将其重命名为 `tsn_ucf101.py`,然后关注配置中的以下四个部分:为原配置中不存在的键添加新键,并修改已有键的取值。
+
+## 修改 Head
+
+`cls_head` 中的 `num_classes` 需要更改为新数据集的类别数。预训练模型的权重会被重用,除了最后的预测层。因此,更改类别数是安全的。在我们的例子中,UCF101 有 101 个类别。所以我们将其从 400(Kinetics-400 的类别数)改为 101。
+
+```python
+# model settings
+model = dict(
+ cls_head=dict(
+ type='TSNHead',
+ num_classes=101 # 将 400 修改为 101
+ ))
+```
+
+## 修改数据集
+
+MMAction2 支持 UCF101、Kinetics-400、Moments in Time、Multi-Moments in Time、THUMOS14、Something-Something V1&V2、ActivityNet 等数据集。用户可能需要把自己的数据集适配为上述某种数据集格式。您可以参考[准备数据集](prepare_dataset.md)和[自定义数据集](../advanced_guides/customize_dataset.md)了解更多细节。在我们的例子中,UCF101 已经被多种数据集类型支持,例如 `VideoDataset`,因此我们将配置修改如下。
+
+```python
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/ucf101/videos_train/'
+data_root_val = 'data/ucf101/videos_val/'
+ann_file_train = 'data/ucf101/ucf101_train_list.txt'
+ann_file_val = 'data/ucf101/ucf101_val_list.txt'
+```
+
+## 修改训练计划
+
+微调通常需要较小的学习率和较少的训练周期。
+
+```python
+train_cfg = dict(
+ type='EpochBasedTrainLoop',
+ max_epochs=50, # 将 100 修改为 50
+ val_begin=1,
+ val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=50, # 将 100 修改为 50
+ by_epoch=True,
+ milestones=[20, 40], # 修改 milestones
+ gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD',
+ lr=0.005, # 将 0.01 修改为 0.005
+ momentum=0.9,
+ weight_decay=0.0001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+```
+
+## 使用预训练模型
+
+为了在整个网络上使用预训练模型,新配置文件在 `load_from` 中添加了预训练模型的链接。我们在 `configs/_base_/default_runtime.py` 中设置 `load_from=None` 作为默认值,并且根据[继承设计](config.md),用户可以通过在其配置中设置 `load_from` 来直接更改它。
+
+```python
+# use the pre-trained model for the whole TSN network
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth' # 模型路径可以在模型库中找到
+```
+
+## 开始训练
+
+现在,我们已经完成了微调的配置文件,如下所示:
+
+```python
+_base_ = [
+ '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py',
+ '../../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+ cls_head=dict(
+ type='TSNHead',
+ num_classes=101 # 将 400 修改为 101
+ ))
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/ucf101/videos_train/'
+data_root_val = 'data/ucf101/videos_val/'
+ann_file_train = 'data/ucf101/ucf101_train_list.txt'
+ann_file_val = 'data/ucf101/ucf101_val_list.txt'
+
+file_client_args = dict(io_backend='disk')
+
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='MultiScaleCrop',
+ input_size=224,
+ scales=(1, 0.875, 0.75, 0.66),
+ random_crop=False,
+ max_wh_scale_gap=1),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=3,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=25,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='TenCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop',
+ max_epochs=50, # 将 100 修改为 50
+ val_begin=1,
+ val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=50, # 将 100 修改为 50
+ by_epoch=True,
+ milestones=[20, 40], # 修改 milestones
+ gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD',
+ lr=0.005, # 将 0.01 修改为 0.005
+ momentum=0.9,
+ weight_decay=0.0001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+val_evaluator = dict(type='AccMetric')
+test_evaluator = val_evaluator
+
+default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (32 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=256)
+
+# use the pre-trained model for the whole TSN network
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth'
+
+```
+
+另一种更简单的方法是继承 kinetics400 配置,并只指定修改的键。请确保自定义配置与 `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` 在同一个文件夹中。
+
+```python
+_base_ = [
+ 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' # 继承模板配置
+]
+
+# model settings
+model = dict(
+ cls_head=dict(
+ type='TSNHead',
+ num_classes=101)) # 将 400 修改为 101
+
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/ucf101/videos_train/'
+data_root_val = 'data/ucf101/videos_val/'
+ann_file_train = 'data/ucf101/ucf101_train_list.txt'
+ann_file_val = 'data/ucf101/ucf101_val_list.txt'
+
+train_dataloader = dict(
+ dataset=dict(
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root)))
+val_dataloader = dict(
+ dataset=dict(
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val)))
+test_dataloader = dict(
+ dataset=dict(
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val)))
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop',
+ max_epochs=50, # 将 100 修改为 50
+ val_begin=1,
+ val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=50, # 将 100 修改为 50
+ by_epoch=True,
+ milestones=[20, 40], # 修改 milestones
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD',
+ lr=0.005, # 将 0.01 修改为 0.005
+ momentum=0.9,
+ weight_decay=0.0001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# use the pre-trained model for the whole TSN network
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth'
+
+```
+
+你可以使用以下命令在你的数据集上微调模型。
+
+```shell
+python tools/train.py ${CONFIG_FILE} [可选参数]
+```
+
+例如:以确定性选项在 UCF101 数据集上微调 TSN 模型。
+
+```shell
+python tools/train.py configs/recognition/tsn/tsn_ucf101.py \
+ --seed=0 --deterministic
+```
+
+更多细节,请参考[训练和测试教程](train_test.md)中的**训练**部分。
diff --git a/docs/zh_cn/user_guides/inference.md b/docs/zh_cn/user_guides/inference.md
new file mode 100644
index 0000000000..8456f58264
--- /dev/null
+++ b/docs/zh_cn/user_guides/inference.md
@@ -0,0 +1,39 @@
+# 使用现有模型进行推理
+
+MMAction2 在[模型库](../modelzoo.md)中提供了预训练的视频理解模型。本文将展示如何使用现有模型对给定的视频进行推理。
+
+关于如何在标准数据集上测试现有模型,请参考这个[指南](./train_test.md#test)。
+
+## 对给定视频进行推理
+
+MMAction2 提供了用于对给定视频进行推理的高级 Python API:
+
+- [init_recognizer](mmaction.apis.init_recognizer): 使用配置文件和权重文件初始化一个识别器
+- [inference_recognizer](mmaction.apis.inference_recognizer): 对给定视频进行推理
+
+下面是一个使用 Kinetics-400 预训练权重构建模型并对给定视频进行推理的示例。
+
+```{note}
+如果您将 mmaction2 用作第三方包,您需要下载示例中的配置文件和演示视频。
+
+运行 'mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb --dest .' 下载所需的配置文件。
+
+运行 'wget https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.mp4' 下载所需的演示视频。
+```
+
+```python
+from mmaction.apis import inference_recognizer, init_recognizer
+
+config_path = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py'
+checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth' # 可以是本地路径
+img_path = 'demo/demo.mp4' # 您可以指定自己的视频路径
+
+# 从配置文件和权重文件中构建模型
+model = init_recognizer(config_path, checkpoint_path, device="cpu") # device 可以是 'cuda:0'
+# 对单个视频进行测试
+result = inference_recognizer(model, img_path)
+```
+
+`result` 是一个包含 `pred_scores` 的字典。
+
+示例中的动作识别演示可以在[demo/demo.py](https://github.com/open-mmlab/mmaction2/blob/main/demo/demo.py)中找到。
diff --git a/docs/zh_cn/user_guides/prepare_dataset.md b/docs/zh_cn/user_guides/prepare_dataset.md
new file mode 100644
index 0000000000..b8cdfee69b
--- /dev/null
+++ b/docs/zh_cn/user_guides/prepare_dataset.md
@@ -0,0 +1,253 @@
+# 准备数据集
+
+MMAction2 支持许多现有的数据集。在本章中,我们将引导您准备 MMAction2 的数据集。
+
+- [准备数据集](#准备数据集)
+ - [关于视频数据格式的说明](#关于视频数据格式的说明)
+ - [使用内置数据集](#使用内置数据集)
+ - [使用自定义数据集](#使用自定义数据集)
+ - [动作识别](#动作识别)
+ - [基于骨骼的动作识别](#基于骨骼的动作识别)
+ - [时空动作检测](#时空动作检测)
+ - [时序动作定位](#时序动作定位)
+ - [使用混合数据集进行训练](#使用混合数据集进行训练)
+ - [重复数据集](#重复数据集)
+ - [浏览数据集](#浏览数据集)
+
+## 关于视频数据格式的说明
+
+MMAction2 支持两种类型的数据格式:原始帧和视频。前者在之前的项目(如 [TSN](https://github.com/yjxiong/temporal-segment-networks))中被广泛使用。当 SSD 可用时,这种方法运行速度很快,但无法满足日益增长的数据集需求(例如,最新的 [Kinetics](https://www.deepmind.com/open-source/kinetics) 数据集有 65 万个视频,总帧数将占用几 TB 的空间)。后者可以节省空间,但必须在执行时进行计算密集型的视频解码。为了加快视频解码速度,我们支持几种高效的视频加载库,如 [decord](https://github.com/zhreshold/decord)、[PyAV](https://github.com/PyAV-Org/PyAV) 等。
+
+## 使用内置数据集
+
+MMAction2 已经支持许多数据集,我们在路径 `$MMACTION2/tools/data/` 下提供了用于数据准备的 shell 脚本,请参考[支持的数据集](../datasetzoo_statistics.md)以获取准备特定数据集的详细信息。
+
+## 使用自定义数据集
+
+最简单的方法是将您的数据集转换为现有的数据集格式:
+
+- `RawFrameDataset` 和 `VideoDataset` 用于[动作识别](#动作识别)
+- `PoseDataset` 用于[基于骨骼的动作识别](#基于骨骼的动作识别)
+- `AVADataset` 用于[时空动作检测](#时空动作检测)
+- `ActivityNetDataset` 用于[时序动作定位](#时序动作定位)
+
+在数据预处理之后,用户需要进一步修改配置文件以使用数据集。以下是在原始帧格式中使用自定义数据集的示例。
+
+在 `configs/task/method/my_custom_config.py` 中:
+
+```python
+...
+# 数据集设置
+dataset_type = 'RawframeDataset'
+data_root = 'path/to/your/root'
+data_root_val = 'path/to/your/root_val'
+ann_file_train = 'data/custom/custom_train_list.txt'
+ann_file_val = 'data/custom/custom_val_list.txt'
+ann_file_test = 'data/custom/custom_val_list.txt'
+...
+train_dataloader = dict(
+    batch_size=32,
+    num_workers=2,
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        ...))
+val_dataloader = dict(
+    batch_size=32,
+    num_workers=2,
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        test_mode=True,
+        ...))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        test_mode=True,
+        ...))
+...
+```
+
+### 动作识别
+
+动作识别有两种类型的注释文件。
+
+- `RawframeDataset` 的原始帧注释
+
+ 原始帧数据集的注释是一个包含多行的文本文件,每一行表示一个视频的 `frame_directory`(相对路径)、视频的 `total_frames` 和视频的 `label`,用空格分隔。
+
+ 以下是一个示例。
+
+ ```
+ some/directory-1 163 1
+ some/directory-2 122 1
+ some/directory-3 258 2
+ some/directory-4 234 2
+ some/directory-5 295 3
+ some/directory-6 121 3
+ ```
+
+- `VideoDataset` 的视频注释
+
+ 视频数据集的注释是一个包含多行的文本文件,每一行表示一个样本视频,包括 `filepath`(相对路径)和 `label`,用空格分隔。
+
+ 以下是一个示例。
+
+ ```
+ some/path/000.mp4 1
+ some/path/001.mp4 1
+ some/path/002.mp4 2
+ some/path/003.mp4 2
+ some/path/004.mp4 3
+ some/path/005.mp4 3
+ ```
+
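+下面是一个按上述 `VideoDataset` 格式生成标注文件的示例脚本(其中的目录结构与输出路径均为假设:假定每个类别的视频放在以类别命名的子目录下):
+
+```python
+import os
+import os.path as osp
+
+data_root = 'data/custom/videos'            # 假设的视频根目录
+classes = sorted(os.listdir(data_root))     # 以子目录名作为类别名
+label_map = {name: idx for idx, name in enumerate(classes)}
+
+with open('data/custom/custom_train_list.txt', 'w') as f:
+    for name, label in label_map.items():
+        for video in sorted(os.listdir(osp.join(data_root, name))):
+            # 每行:相对路径 标签
+            f.write(f'{name}/{video} {label}\n')
+```
+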
+### 基于骨骼的动作识别
+
+该任务基于骨骼序列(关键点的时间序列)识别动作类别。我们提供了一些方法来构建自定义的骨骼数据集。
+
+- 从 RGB 视频数据构建
+
+ 您需要从视频中提取关键点数据,并将其转换为支持的格式。我们提供了一个[教程](https://github.com/open-mmlab/mmaction2/tree/main/configs/skeleton/posec3d/custom_dataset_training.md),详细介绍了如何执行。
+
+- 从现有关键点数据构建
+
+ 假设您已经有了 coco 格式的关键点数据,您可以将它们收集到一个 pickle 文件中。
+
+ 每个 pickle 文件对应一个动作识别数据集。pickle 文件的内容是一个字典,包含两个字段:`split` 和 `annotations`
+
+  1. Split:`split` 字段的值是一个字典:键是拆分名称,值是属于该拆分的视频标识符列表。
+ 2. Annotations:`annotations` 字段的值是一个骨骼注释列表,每个骨骼注释是一个字典,包含以下字段:
+ - `frame_dir`(str):对应视频的标识符。
+ - `total_frames`(int):此视频中的帧数。
+ - `img_shape`(tuple\[int\]):视频帧的形状,一个包含两个元素的元组,格式为 `(height, width)`。仅对 2D 骨骼需要。
+ - `original_shape`(tuple\[int\]):与 `img_shape` 相同。
+ - `label`(int):动作标签。
+ - `keypoint`(np.ndarray,形状为 `[M x T x V x C]`):关键点注释。
+ - M:人数;
+ - T:帧数(与 `total_frames` 相同);
+ - V:关键点数量(NTURGB+D 3D 骨骼为 25,Coco 为 17,OpenPose 为 18 等);
+ - C:关键点坐标的维数(2D 关键点为 C=2,3D 关键点为 C=3)。
+ - `keypoint_score`(np.ndarray,形状为 `[M x T x V]`):关键点的置信度分数。仅对 2D 骨骼需要。
+
+ 以下是一个示例:
+
+ ```
+  {
+      "split":
+          {
+              'xsub_train':
+                  ['S001C001P001R001A001', ...],
+              'xsub_val':
+                  ['S001C001P003R001A001', ...],
+              ...
+          },
+      "annotations":
+          [
+              {
+                  'frame_dir': 'S001C001P001R001A001',
+                  'label': 0,
+                  'img_shape': (1080, 1920),
+                  'original_shape': (1080, 1920),
+                  'total_frames': 103,
+                  'keypoint': array([[[[1032. , 334.8], ...]]]),
+                  'keypoint_score': array([[[0.934 , 0.9766, ...]]])
+              },
+              {
+                  'frame_dir': 'S001C001P003R001A001',
+                  ...
+              },
+              ...
+          ]
+  }
+ ```
+
+  如需支持其他关键点格式,则需要进行进一步的修改,请参考[自定义数据集](../advanced_guides/customize_dataset.md)。下面给出一个将现有关键点数据打包为上述格式的示例脚本。
+
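+下面是一个将现有关键点数据打包为上述 pickle 格式的最小示例(示例中的关键点数据为随机生成,仅用于演示字段的组织方式):
+
+```python
+import pickle
+
+import numpy as np
+
+num_person, num_frame, num_keypoint = 1, 103, 17
+anno = dict(
+    frame_dir='S001C001P001R001A001',
+    label=0,
+    img_shape=(1080, 1920),
+    original_shape=(1080, 1920),
+    total_frames=num_frame,
+    # [M x T x V x C],2D 关键点 C=2
+    keypoint=np.random.rand(num_person, num_frame, num_keypoint,
+                            2).astype(np.float16),
+    # [M x T x V],2D 骨骼需要关键点置信度分数
+    keypoint_score=np.random.rand(num_person, num_frame,
+                                  num_keypoint).astype(np.float16))
+
+data = dict(
+    split=dict(xsub_train=['S001C001P001R001A001'], xsub_val=[]),
+    annotations=[anno])
+
+with open('my_skeleton_dataset.pkl', 'wb') as f:
+    pickle.dump(data, f)
+```
+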
+### 时空动作检测
+
+MMAction2 支持基于 `AVADataset` 的时空动作检测任务。注释包含真实边界框和提议边界框。
+
+- 真实边界框
+ 真实边界框是一个包含多行的 csv 文件,每一行是一个帧的检测样本,格式如下:
+
+  `video_identifier, time_stamp, lt_x, lt_y, rb_x, rb_y, label, entity_id`
+
+  每个字段的含义如下:
+
+  - `video_identifier`:对应视频的标识符
+  - `time_stamp`:当前帧的时间戳
+  - `lt_x`:左上角点的规范化 x 坐标
+  - `lt_y`:左上角点的规范化 y 坐标
+  - `rb_x`:右下角点的规范化 x 坐标
+  - `rb_y`:右下角点的规范化 y 坐标
+  - `label`:动作标签
+  - `entity_id`:一个唯一的整数,用于将此框与该视频相邻帧中描绘同一个人的其他框关联起来
+
+ 以下是一个示例:
+
+ ```
+ _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0
+ _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0
+ ...
+ ```
+
+- 提议边界框
+  提议边界框是由人体检测器(通常需要在目标数据集上微调)生成的 pickle 文件。该 pickle 文件包含一个具有以下结构的字典:
+
+ `{'video_identifier,time_stamp': bbox_info}`
+
+  - `video_identifier`(str):对应视频的标识符
+  - `time_stamp`(int):当前帧的时间戳
+  - `bbox_info`(np.ndarray,形状为 `[n, 5]`):检测到的边界框,每行为 \[x1, y1, x2, y2, score\]。x1、y1、x2、y2 是相对于帧大小归一化的值,范围为 0.0-1.0。
+
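+下面是一个读取并检查提议文件结构的示例(文件名仅为示例;`encoding='latin1'` 针对用 Python 2 序列化的官方提议文件,视实际文件可省略):
+
+```python
+import pickle
+
+with open('ava_dense_proposals_train.FAIR.recall_93.9.pkl', 'rb') as f:
+    proposals = pickle.load(f, encoding='latin1')
+
+# 键为 'video_identifier,time_stamp',值为形状 [n, 5] 的数组
+bboxes = proposals['_-Z6wFjXtGQ,0902']
+print(bboxes.shape, bboxes.min(), bboxes.max())  # 坐标应在 0.0-1.0 之间
+```
+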
+### 时序动作定位
+
+我们支持基于 `ActivityNetDataset` 的时序动作定位。ActivityNet 数据集的注释是一个 json 文件。每个键是一个视频名,相应的值是视频的元数据和注释。
+
+以下是一个示例:
+
+```
+{
+ "video1": {
+ "duration_second": 211.53,
+ "duration_frame": 6337,
+ "annotations": [
+ {
+ "segment": [
+ 30.025882995319815,
+ 205.2318595943838
+ ],
+ "label": "Rock climbing"
+ }
+ ],
+ "feature_frame": 6336,
+ "fps": 30.0,
+ "rfps": 29.9579255898
+ },
+ "video2": {...
+ }
+ ...
+}
+```
+
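+下面是一个用 Python 生成上述格式注释文件的示例(其中的视频信息与输出路径均为示意):
+
+```python
+import json
+
+annotations = {
+    'video1': dict(
+        duration_second=211.53,
+        duration_frame=6337,
+        annotations=[dict(segment=[30.03, 205.23], label='Rock climbing')],
+        feature_frame=6336,
+        fps=30.0,
+        rfps=29.9579255898)
+}
+
+with open('my_activitynet_anno.json', 'w') as f:
+    json.dump(annotations, f, indent=2)
+```
+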
+## 使用混合数据集进行训练
+
+MMAction2 还支持混合数据集进行训练。目前,它支持重复数据集。
+
+### 重复数据集
+
+我们使用 `RepeatDataset` 作为包装器来重复数据集。例如,假设原始数据集为 `Dataset_A`,要重复它,配置如下所示:
+
+```python
+dataset_A_train = dict(
+ type='RepeatDataset',
+ times=N,
+ dataset=dict( # 这是 Dataset_A 的原始配置
+ type='Dataset_A',
+ ...
+ pipeline=train_pipeline
+ )
+ )
+```
+
+## 浏览数据集
+
+即将推出...
diff --git a/docs/zh_cn/user_guides/train_test.md b/docs/zh_cn/user_guides/train_test.md
new file mode 100644
index 0000000000..fe4c3a3cb5
--- /dev/null
+++ b/docs/zh_cn/user_guides/train_test.md
@@ -0,0 +1,248 @@
+# 训练与测试
+
+- [训练与测试](#训练与测试)
+ - [训练](#训练)
+ - [使用单个 GPU 进行训练](#使用单个-gpu-进行训练)
+ - [使用多个 GPU 进行训练](#使用多个-gpu-进行训练)
+ - [使用多台机器进行训练](#使用多台机器进行训练)
+ - [同一网络中的多台机器](#同一网络中的多台机器)
+ - [使用 slurm 管理的多台机器](#使用-slurm-管理的多台机器)
+ - [测试](#测试)
+ - [使用单个 GPU 进行测试](#使用单个-gpu-进行测试)
+ - [使用多个 GPU 进行测试](#使用多个-gpu-进行测试)
+ - [使用多台机器进行测试](#使用多台机器进行测试)
+ - [同一网络中的多台机器](#同一网络中的多台机器-1)
+ - [使用 slurm 管理的多台机器](#使用-slurm-管理的多台机器-1)
+
+## 训练
+
+### 使用单个 GPU 进行训练
+
+您可以使用 `tools/train.py` 在一台带有 CPU 和可选 GPU 的单机上训练模型。
+
+下面是脚本的完整用法:
+
+```shell
+python tools/train.py ${CONFIG_FILE} [ARGS]
+```
+
+````{note}
+默认情况下,MMAction2 更倾向于使用 GPU 而不是 CPU 进行训练。如果您想在 CPU 上训练模型,请清空 `CUDA_VISIBLE_DEVICES` 或将其设置为 -1 以使 GPU 对程序不可见。
+
+```bash
+CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [ARGS]
+```
+````
+
+| 参数 | 描述 |
+| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `CONFIG_FILE` | 配置文件的路径。 |
+| `--work-dir WORK_DIR` | 保存日志和权重的目标文件夹。默认为与配置文件相同名称的文件夹,位于 `./work_dirs` 下。 |
+| `--resume [RESUME]` | 恢复训练。如果指定了路径,则从该路径恢复,如果未指定,则尝试从最新的权重自动恢复。 |
+| `--amp` | 启用自动混合精度训练。 |
+| `--no-validate` | **不建议使用**。在训练期间禁用权重评估。 |
+| `--auto-scale-lr` | 根据实际批次大小和原始批次大小自动缩放学习率。 |
+| `--seed` | 随机种子。 |
+| `--diff-rank-seed` | 是否为不同的 rank 设置不同的种子。 |
+| `--deterministic` | 是否为 CUDNN 后端设置确定性选项。 |
+| `--cfg-options CFG_OPTIONS` | 覆盖使用的配置中的某些设置,xxx=yyy 格式的键值对将合并到配置文件中。如果要覆盖的值是一个列表,则应采用 `key="[a,b]"` 或 `key=a,b` 的形式。该参数还允许嵌套的列表/元组值,例如 `key="[(a,b),(c,d)]"`。请注意,引号是必需的,且不允许有空格。 |
+| `--launcher {none,pytorch,slurm,mpi}` | 作业启动器的选项。默认为 `none`。 |
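+
+例如,下面的命令组合使用了上表中的几个可选参数(配置文件路径与工作目录仅为示例,可替换为您自己的):
+
+```shell
+python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \
+    --work-dir work_dirs/tsn_k400 \
+    --amp \
+    --seed 0 --deterministic \
+    --cfg-options train_dataloader.batch_size=16
+```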
+
+### 使用多个 GPU 进行训练
+
+我们提供了一个 shell 脚本使用 `torch.distributed.launch` 来启动多个 GPU 的训练任务。
+
+```shell
+bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS]
+```
+
+| 参数 | 描述 |
+| ---------- | ----------------------------------------------------------------------- |
+| `CONFIG` | 配置文件的路径。 |
+| `GPUS` | 要使用的 GPU 数量。 |
+| `[PYARGS]` | `tools/train.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行训练)。 |
+
+您还可以通过环境变量来指定启动器的其他参数。例如,使用以下命令将启动器的通信端口更改为 29666:
+
+```shell
+PORT=29666 bash tools/dist_train.sh ${CONFIG} ${GPUS} [PY_ARGS]
+```
+
+如果您想启动多个训练作业并使用不同的 GPU,可以通过指定不同的端口和可见设备来启动它们。
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS]
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_train.sh ${CONFIG} 4 [PY_ARGS]
+```
+
+### 使用多台机器进行训练
+
+#### 同一网络中的多台机器
+
+如果您使用以太网连接的多台机器启动训练作业,可以运行以下命令:
+
+在第一台机器上:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
+```
+
+在第二台机器上:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
+```
+
+需要指定以下额外的环境变量来训练或测试多台机器上的模型:
+
+| ENV_VARS | 描述 |
+| ------------- | ---------------------------------------------------------------- |
+| `NNODES` | 机器的总数。默认为 1。 |
+| `NODE_RANK` | 本地机器的索引。默认为 0。 |
+| `PORT` | 通信端口,在所有机器上应该保持一致。默认为 29500。 |
+| `MASTER_ADDR` | 主机器的 IP 地址,在所有机器上应该保持一致。默认为 `127.0.0.1`。 |
+
+通常,如果您没有高速网络(如 InfiniBand),则速度会比较慢。
+
+#### 使用 slurm 管理的多台机器
+
+如果您在使用 [slurm](https://slurm.schedmd.com/) 管理的集群上运行 MMAction2,可以使用脚本 `slurm_train.sh`。
+
+```shell
+[ENV_VARS] bash tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG} [PY_ARGS]
+```
+
+下面是该脚本的参数描述。
+
+| 参数 | 描述 |
+| ----------- | ----------------------------------------------------------------------- |
+| `PARTITION` | 集群中要使用的分区。 |
+| `JOB_NAME` | 作业的名称,您可以自定义。 |
+| `CONFIG` | 配置文件的路径。 |
+| `[PYARGS]` | `tools/train.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行训练)。 |
+
+下面列出了可用于配置 slurm 作业的环境变量。
+
+| ENV_VARS | 描述 |
+| --------------- | -------------------------------------------------------------------------------- |
+| `GPUS` | 要使用的 GPU 数量。默认为 8。 |
+| `GPUS_PER_NODE` | 每个节点要分配的 GPU 数量。默认为 8。 |
+| `CPUS_PER_TASK` | 每个任务要分配的 CPU 数量(通常一个 GPU 对应一个任务)。默认为 5。 |
+| `SRUN_ARGS` | `srun` 的其他参数。可用选项可在[这里](https://slurm.schedmd.com/srun.html)找到。 |
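+
+例如,下面的命令在名为 `my_partition` 的分区上用 16 块 GPU(2 个节点)提交训练作业(分区名、作业名与配置文件路径均为示例):
+
+```shell
+GPUS=16 GPUS_PER_NODE=8 bash tools/slurm_train.sh my_partition train_tsn \
+    configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py
+```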
+
+## 测试
+
+### 使用单个 GPU 进行测试
+
+您可以使用 `tools/test.py` 在一台带有 CPU 和可选 GPU 的单机上测试模型。
+
+下面是脚本的完整用法:
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS]
+```
+
+````{note}
+默认情况下,MMAction2 更倾向于使用 GPU 而不是 CPU 进行测试。如果您想在 CPU 上测试模型,请清空 `CUDA_VISIBLE_DEVICES` 或将其设置为 -1 以使 GPU 对程序不可见。
+
+```bash
+CUDA_VISIBLE_DEVICES=-1 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [ARGS]
+```
+````
+
+| 参数 | 描述 |
+| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `CONFIG_FILE` | 配置文件的路径。 |
+| `CHECKPOINT_FILE` | 权重文件的路径(可以是 HTTP 链接)。 |
+| `--work-dir WORK_DIR` | 保存包含评估指标的文件的目录。默认为与配置文件相同名称的文件夹,位于 `./work_dirs` 下。 |
+| `--dump DUMP` | 存储模型的所有输出以进行离线评估的路径。 |
+| `--cfg-options CFG_OPTIONS` | 覆盖使用的配置中的某些设置,xxx=yyy 格式的键值对将合并到配置文件中。如果要覆盖的值是一个列表,则应采用 `key="[a,b]"` 或 `key=a,b` 的形式。该参数还允许嵌套的列表/元组值,例如 `key="[(a,b),(c,d)]"`。请注意,引号是必需的,且不允许有空格。 |
+| `--show-dir SHOW_DIR` | 保存结果可视化图片的目录。 |
+| `--show` | 在窗口中可视化预测结果。 |
+| `--interval INTERVAL` | 可视化的样本间隔。默认为 1。 |
+| `--wait-time WAIT_TIME` | 每个窗口的显示时间(单位:秒)。默认为 2。 |
+| `--launcher {none,pytorch,slurm,mpi}` | 作业启动器的选项。默认为 `none`。 |
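+
+例如,下面的命令在测试的同时通过 `--dump` 保存模型的全部输出,便于离线评估(权重路径仅为示例):
+
+```shell
+python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py \
+    checkpoints/tsn_k400.pth \
+    --dump result.pkl
+```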
+
+### 使用多个 GPU 进行测试
+
+我们提供了一个 shell 脚本使用 `torch.distributed.launch` 来启动多个 GPU 的测试任务。
+
+```shell
+bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS]
+```
+
+| 参数 | 描述 |
+| ------------ | ---------------------------------------------------------------------- |
+| `CONFIG` | 配置文件的路径。 |
+| `CHECKPOINT` | 权重文件的路径(可以是 HTTP 链接)。 |
+| `GPUS` | 要使用的 GPU 数量。 |
+| `[PYARGS]` | `tools/test.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行测试)。 |
+
+您还可以通过环境变量来指定启动器的其他参数。例如,使用以下命令将启动器的通信端口更改为 29666:
+
+```shell
+PORT=29666 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} ${GPUS} [PY_ARGS]
+```
+
+如果您想启动多个测试作业并使用不同的 GPU,可以通过指定不同的端口和可见设备来启动它们。
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS]
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_test.sh ${CONFIG} ${CHECKPOINT} 4 [PY_ARGS]
+```
+
+### 使用多台机器进行测试
+
+#### 同一网络中的多台机器
+
+如果您使用以太网连接的多台机器进行测试作业,可以运行以下命令:
+
+在第一台机器上:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS
+```
+
+在第二台机器上:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_test.sh $CONFIG $CHECKPOINT $GPUS
+```
+
+与单台机器上的多个 GPU 相比,您需要指定一些额外的环境变量:
+
+| ENV_VARS | 描述 |
+| ------------- | ---------------------------------------------------------------- |
+| `NNODES` | 机器的总数。默认为 1。 |
+| `NODE_RANK` | 本地机器的索引。默认为 0。 |
+| `PORT` | 通信端口,在所有机器上应该保持一致。默认为 29500。 |
+| `MASTER_ADDR` | 主机器的 IP 地址,在所有机器上应该保持一致。默认为 `127.0.0.1`。 |
+
+通常,如果您没有高速网络(如 InfiniBand),则速度会比较慢。
+
+#### 使用 slurm 管理的多台机器
+
+如果您在使用 [slurm](https://slurm.schedmd.com/) 管理的集群上运行 MMAction2,可以使用脚本 `slurm_test.sh`。
+
+```shell
+[ENV_VARS] bash tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG} ${CHECKPOINT} [PY_ARGS]
+```
+
+下面是该脚本的参数描述。
+
+| 参数 | 描述 |
+| ------------ | ---------------------------------------------------------------------- |
+| `PARTITION` | 集群中要使用的分区。 |
+| `JOB_NAME` | 作业的名称,您可以自定义。 |
+| `CONFIG` | 配置文件的路径。 |
+| `CHECKPOINT` | 权重文件的路径(可以是 HTTP 链接)。 |
+| `[PYARGS]` | `tools/test.py` 的其他可选参数,请参见[这里](#使用单个-gpu-进行测试)。 |
+
+下面列出了可用于配置 slurm 作业的环境变量。
+
+| ENV_VARS | 描述 |
+| --------------- | -------------------------------------------------------------------------------- |
+| `GPUS` | 要使用的 GPU 数量。默认为 8。 |
+| `GPUS_PER_NODE` | 每个节点要分配的 GPU 数量。默认为 8。 |
+| `CPUS_PER_TASK` | 每个任务要分配的 CPU 数量(通常一个 GPU 对应一个任务)。默认为 5。 |
+| `SRUN_ARGS` | `srun` 的其他参数。可用选项可在[这里](https://slurm.schedmd.com/srun.html)找到。 |
diff --git a/docs/zh_cn/user_guides/useful_tools.md b/docs/zh_cn/user_guides/useful_tools.md
deleted file mode 100644
index e16e41793c..0000000000
--- a/docs/zh_cn/user_guides/useful_tools.md
+++ /dev/null
@@ -1 +0,0 @@
-# 其他实用工具(内容建设中)
diff --git a/docs/zh_cn/user_guides/visualization.md b/docs/zh_cn/user_guides/visualization.md
deleted file mode 100644
index 9d1aa2a2e7..0000000000
--- a/docs/zh_cn/user_guides/visualization.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# 可视化工具
-
-## 对数据集可视化
-
-你可以使用`tools/analysis_tools/browse_dataset.py`去可视化数据集。
-
-```shell
-python tools/analysis_tools/browse_dataset.py ${CONFIG_FILE} [ARGS]
-```
-
-| 参数 | 含义 |
-| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `CONFIG_FILE` | 配置文件的路径。 |
-| `--output-dir OUTPUT_DIR` | 如果没有display显示接口,你能将可视化结果保存到`OUTPUT_DIR`,默认为None。 |
-| `--show-frames` | 如果你拥有显示接口,会显示视频的帧内容,默认为False。 |
-| `--phase PHASE` | 想要可视化的数据集阶段,接受`train`, `test` 和`val`. 默认为`train`。 |
-| `--show-number SHOW_NUMBER` | 选择可视化的图像数量,必须比0大,如果数量比数据集长度更大,则展示数据集中的所有图像,默认为"sys.maxsize",展示数据集中所有图像。 |
-| `--show-interval SHOW_INTERVAL` | 显示的间隔,默认为2The interval of show (s). Defaults to 2。 |
-| `--mode MODE` | 显示模式:显示原始视频或者变换后的视频。`original` 表示显示从硬盘中导入的视频,而`transformed` 表示显示变换后的视频,默认为`transformed`。 |
-| `--cfg-options CFG_OPTIONS` | 覆盖一些正在使用的config配置的设置,像”xxx=yyy“形式的键值对将会被合并进config配置文件。如果将被覆盖的是一个列表,它的形式将是`key="[a,b]"` 或 `key=a,b`的格式。该参数还允许嵌套列表/元组值,例如`key="[(a,b),(c,d)]"`. 请注意,引号是必需的,不允许有空格。 |
diff --git a/docs/zh_cn/utils.py b/docs/zh_cn/utils.py
new file mode 100644
index 0000000000..a3ad213730
--- /dev/null
+++ b/docs/zh_cn/utils.py
@@ -0,0 +1,28 @@
+import re
+from pathlib import Path
+
+
+def replace_link(pattern, template, content, file_path):
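+    """Replace relative links in ``content`` with full GitHub URLs.
+
+    ``pattern`` is a regex whose group 1 captures the link name and group 2
+    captures the link target; ``template`` is a format string with two
+    placeholders used to rebuild the link. ``file_path`` is the path of the
+    file being processed and is used to resolve relative links. Links that
+    already start with ``http`` or ``#`` keep their original targets.
+    """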
+ MMACT_ROOT = Path(__file__).absolute().parents[2]
+ GITHUB_PREFIX = 'https://github.com/open-mmlab/mmaction2/blob/main/'
+
+ def replace_core(matchobj):
+ name = matchobj.group(1)
+ link = matchobj.group(2)
+ if link.startswith('http') or link.startswith('#'):
+ return template.format(name, link)
+ # For link relative to project folder, such as '/configs/*/*.py'
+ elif Path(link).is_absolute():
+ link = link.lstrip('/')
+ folder = MMACT_ROOT
+ # For link relative to current file, such as './config/*.py'
+ else:
+ folder = file_path.parent
+ file_link = link.split('#')[0]
+ assert (folder / file_link).exists(), \
+ f'Link not found:\n{file_path}: {folder / link}'
+ rel_link = (folder / link).resolve().relative_to(MMACT_ROOT)
+ link = GITHUB_PREFIX + str(rel_link)
+ return template.format(name, link)
+
+ return re.sub(pattern, replace_core, content)
diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py
index d0a4c01501..3561101e00 100644
--- a/mmaction/apis/inference.py
+++ b/mmaction/apis/inference.py
@@ -39,7 +39,8 @@ def init_recognizer(config: Union[str, Path, mmengine.Config],
init_default_scope(config.get('default_scope', 'mmaction'))
- if config.model.backbone.get('pretrained', None):
+ if hasattr(config.model, 'backbone') and config.model.backbone.get(
+ 'pretrained', None):
config.model.backbone.pretrained = None
model = MODELS.build(config.model)
@@ -72,6 +73,7 @@ def inference_recognizer(model: nn.Module,
if test_pipeline is None:
cfg = model.cfg
+ init_default_scope(cfg.get('default_scope', 'mmaction'))
test_pipeline_cfg = cfg.test_pipeline
test_pipeline = Compose(test_pipeline_cfg)
@@ -99,7 +101,8 @@ def inference_recognizer(model: nn.Module,
return result
-def detection_inference(det_config: Union[str, Path, mmengine.Config],
+def detection_inference(det_config: Union[str, Path, mmengine.Config,
+ nn.Module],
det_checkpoint: str,
frame_paths: List[str],
det_score_thr: float = 0.9,
@@ -109,8 +112,10 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config],
"""Detect human boxes given frame paths.
Args:
- det_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`]): Config
- file path, :obj:`Path` or the config object.
+ det_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`,
+ :obj:`torch.nn.Module`]):
+ Det config file path or Detection model object. It can be
+ a :obj:`Path`, a config object, or a module object.
det_checkpoint: Checkpoint path/url.
frame_paths (List[str]): The paths of frames to do detection inference.
det_score_thr (float): The threshold of human detection score.
@@ -133,9 +138,11 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config],
raise ImportError('Failed to import `inference_detector` and '
'`init_detector` from `mmdet.apis`. These apis are '
'required in this inference api! ')
-
- model = init_detector(
- config=det_config, checkpoint=det_checkpoint, device=device)
+ if isinstance(det_config, nn.Module):
+ model = det_config
+ else:
+ model = init_detector(
+ config=det_config, checkpoint=det_checkpoint, device=device)
results = []
data_samples = []
@@ -160,7 +167,7 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config],
return results, data_samples
-def pose_inference(pose_config: Union[str, Path, mmengine.Config],
+def pose_inference(pose_config: Union[str, Path, mmengine.Config, nn.Module],
pose_checkpoint: str,
frame_paths: List[str],
det_results: List[np.ndarray],
@@ -168,8 +175,10 @@ def pose_inference(pose_config: Union[str, Path, mmengine.Config],
"""Perform Top-Down pose estimation.
Args:
- pose_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`]): Config
- file path, :obj:`Path` or the config object.
+ pose_config (Union[str, :obj:`Path`, :obj:`mmengine.Config`,
+ :obj:`torch.nn.Module`]): Pose config file path or
+ pose model object. It can be a :obj:`Path`, a config object,
+ or a module object.
pose_checkpoint: Checkpoint path/url.
frame_paths (List[str]): The paths of frames to do pose inference.
det_results (List[np.ndarray]): List of detected human boxes.
@@ -188,8 +197,10 @@ def pose_inference(pose_config: Union[str, Path, mmengine.Config],
raise ImportError('Failed to import `inference_topdown` and '
'`init_model` from `mmpose.apis`. These apis '
'are required in this inference api! ')
-
- model = init_model(pose_config, pose_checkpoint, device)
+ if isinstance(pose_config, nn.Module):
+ model = pose_config
+ else:
+ model = init_model(pose_config, pose_checkpoint, device)
results = []
data_samples = []
diff --git a/mmaction/apis/inferencers/actionrecog_inferencer.py b/mmaction/apis/inferencers/actionrecog_inferencer.py
index 9bfb3af7dd..f45f137b59 100644
--- a/mmaction/apis/inferencers/actionrecog_inferencer.py
+++ b/mmaction/apis/inferencers/actionrecog_inferencer.py
@@ -12,7 +12,7 @@
from mmaction.registry import INFERENCERS
from mmaction.structures import ActionDataSample
-from mmaction.utils import ConfigType
+from mmaction.utils import ConfigType, get_str_type
InstanceList = List[InstanceData]
InputType = Union[str, np.ndarray]
@@ -167,34 +167,35 @@ def _init_pipeline(self, cfg: ConfigType) -> Compose:
# Alter data pipelines for decode
if self.input_format == 'array':
for i in range(len(test_pipeline)):
- if 'Decode' in test_pipeline[i]['type']:
+ if 'Decode' in get_str_type(test_pipeline[i]['type']):
test_pipeline[i] = dict(type='ArrayDecode')
test_pipeline = [
x for x in test_pipeline if 'Init' not in x['type']
]
elif self.input_format == 'video':
- if 'Init' not in test_pipeline[0]['type']:
+ if 'Init' not in get_str_type(test_pipeline[0]['type']):
test_pipeline = [dict(type='DecordInit')] + test_pipeline
else:
test_pipeline[0] = dict(type='DecordInit')
for i in range(len(test_pipeline)):
- if 'Decode' in test_pipeline[i]['type']:
+ if 'Decode' in get_str_type(test_pipeline[i]['type']):
test_pipeline[i] = dict(type='DecordDecode')
elif self.input_format == 'rawframes':
- if 'Init' in test_pipeline[0]['type']:
+ if 'Init' in get_str_type(test_pipeline[0]['type']):
test_pipeline = test_pipeline[1:]
for i in range(len(test_pipeline)):
- if 'Decode' in test_pipeline[i]['type']:
+ if 'Decode' in get_str_type(test_pipeline[i]['type']):
test_pipeline[i] = dict(type='RawFrameDecode')
# Alter data pipelines to close TTA, avoid OOM
# Use center crop instead of multiple crop
for i in range(len(test_pipeline)):
- if test_pipeline[i]['type'] in ['ThreeCrop', 'TenCrop']:
+ if get_str_type(
+ test_pipeline[i]['type']) in ['ThreeCrop', 'TenCrop']:
test_pipeline[i]['type'] = 'CenterCrop'
# Use single clip for `Recognizer3D`
if cfg.model.type == 'Recognizer3D':
for i in range(len(test_pipeline)):
- if test_pipeline[i]['type'] == 'SampleFrames':
+ if get_str_type(test_pipeline[i]['type']) == 'SampleFrames':
test_pipeline[i]['num_clips'] = 1
# Pack multiple types of input format
test_pipeline.insert(
diff --git a/mmaction/configs/_base_/__init__.py b/mmaction/configs/_base_/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/mmaction/configs/_base_/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/_base_/default_runtime.py b/mmaction/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000..7f261ea0a1
--- /dev/null
+++ b/mmaction/configs/_base_/default_runtime.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
+ LoggerHook, ParamSchedulerHook, RuntimeInfoHook,
+ SyncBuffersHook)
+from mmengine.runner import LogProcessor
+
+from mmaction.visualization import ActionVisualizer, LocalVisBackend
+
+default_scope = 'mmaction'
+
+default_hooks = dict(
+ runtime_info=dict(type=RuntimeInfoHook),
+ timer=dict(type=IterTimerHook),
+ logger=dict(type=LoggerHook, interval=20, ignore_last=False),
+ param_scheduler=dict(type=ParamSchedulerHook),
+ checkpoint=dict(type=CheckpointHook, interval=1, save_best='auto'),
+ sampler_seed=dict(type=DistSamplerSeedHook),
+ sync_buffers=dict(type=SyncBuffersHook))
+
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'))
+
+log_processor = dict(type=LogProcessor, window_size=20, by_epoch=True)
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(type=ActionVisualizer, vis_backends=vis_backends)
+
+log_level = 'INFO'
+load_from = None
+resume = False
diff --git a/mmaction/configs/_base_/models/__init__.py b/mmaction/configs/_base_/models/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/mmaction/configs/_base_/models/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/_base_/models/slowfast_r50.py b/mmaction/configs/_base_/models/slowfast_r50.py
new file mode 100644
index 0000000000..c7d2051854
--- /dev/null
+++ b/mmaction/configs/_base_/models/slowfast_r50.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmaction.models import (ActionDataPreprocessor, Recognizer3D,
+ ResNet3dSlowFast, SlowFastHead)
+
+# model settings
+model = dict(
+ type=Recognizer3D,
+ backbone=dict(
+ type=ResNet3dSlowFast,
+ pretrained=None,
+ resample_rate=8, # tau
+ speed_ratio=8, # alpha
+ channel_ratio=8, # beta_inv
+ slow_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=True,
+ conv1_kernel=(1, 7, 7),
+ dilations=(1, 1, 1, 1),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1),
+ norm_eval=False),
+ fast_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=False,
+ base_channels=8,
+ conv1_kernel=(5, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ norm_eval=False)),
+ cls_head=dict(
+ type=SlowFastHead,
+ in_channels=2304, # 2048+256
+ num_classes=400,
+ spatial_type='avg',
+ dropout_ratio=0.5,
+ average_clips='prob'),
+ data_preprocessor=dict(
+ type=ActionDataPreprocessor,
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'))
diff --git a/mmaction/configs/_base_/models/slowonly_r50.py b/mmaction/configs/_base_/models/slowonly_r50.py
new file mode 100644
index 0000000000..b6e66706fd
--- /dev/null
+++ b/mmaction/configs/_base_/models/slowonly_r50.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmaction.models import (ActionDataPreprocessor, I3DHead, Recognizer3D,
+ ResNet3dSlowOnly)
+
+model = dict(
+ type=Recognizer3D,
+ backbone=dict(
+ type=ResNet3dSlowOnly,
+ depth=50,
+ pretrained='https://download.pytorch.org/models/resnet50-11ad3fa6.pth',
+ lateral=False,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1),
+ norm_eval=False),
+ cls_head=dict(
+ type=I3DHead,
+ in_channels=2048,
+ num_classes=400,
+ spatial_type='avg',
+ dropout_ratio=0.5,
+ average_clips='prob'),
+ data_preprocessor=dict(
+ type=ActionDataPreprocessor,
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'))
diff --git a/mmaction/configs/_base_/models/swin_tiny.py b/mmaction/configs/_base_/models/swin_tiny.py
new file mode 100644
index 0000000000..bd27f02e3d
--- /dev/null
+++ b/mmaction/configs/_base_/models/swin_tiny.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmaction.models import (ActionDataPreprocessor, I3DHead, Recognizer3D,
+ SwinTransformer3D)
+
+model = dict(
+ type=Recognizer3D,
+ backbone=dict(
+ type=SwinTransformer3D,
+ arch='tiny',
+ pretrained=None,
+ pretrained2d=True,
+ patch_size=(2, 4, 4),
+ window_size=(8, 7, 7),
+ mlp_ratio=4.,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1,
+ patch_norm=True),
+ data_preprocessor=dict(
+ type=ActionDataPreprocessor,
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ cls_head=dict(
+ type=I3DHead,
+ in_channels=768,
+ num_classes=400,
+ spatial_type='avg',
+ dropout_ratio=0.5,
+ average_clips='prob'))
diff --git a/mmaction/configs/recognition/slowfast/__init__.py b/mmaction/configs/recognition/slowfast/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/mmaction/configs/recognition/slowfast/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/recognition/slowfast/slowfast_r50_8xb8_4x16x1_256e_kinetics400_rgb.py b/mmaction/configs/recognition/slowfast/slowfast_r50_8xb8_4x16x1_256e_kinetics400_rgb.py
new file mode 100644
index 0000000000..b8c0e11cf7
--- /dev/null
+++ b/mmaction/configs/recognition/slowfast/slowfast_r50_8xb8_4x16x1_256e_kinetics400_rgb.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from ..._base_.models.slowfast_r50 import *
+ from ..._base_.default_runtime import *
+
+from mmengine.dataset import DefaultSampler
+from mmengine.optim import CosineAnnealingLR, LinearLR
+from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import SGD
+
+from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip,
+ FormatShape, PackActionInputs,
+ RandomResizedCrop, Resize, SampleFrames,
+ ThreeCrop, VideoDataset)
+from mmaction.evaluation import AccMetric
+
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=RandomResizedCrop),
+ dict(type=Resize, scale=(224, 224), keep_ratio=False),
+ dict(type=Flip, flip_ratio=0.5),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+val_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(
+ type=SampleFrames,
+ clip_len=32,
+ frame_interval=2,
+ num_clips=1,
+ test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=CenterCrop, crop_size=224),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+test_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(
+ type=SampleFrames,
+ clip_len=32,
+ frame_interval=2,
+ num_clips=10,
+ test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=ThreeCrop, crop_size=256),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=True),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_test,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type=AccMetric)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type=EpochBasedTrainLoop, max_epochs=256, val_begin=1, val_interval=5)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+optim_wrapper = dict(
+ optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=1e-4),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type=LinearLR,
+ start_factor=0.1,
+ by_epoch=True,
+ begin=0,
+ end=34,
+ convert_to_iter_based=True),
+ dict(
+ type=CosineAnnealingLR,
+ T_max=256,
+ eta_min=0,
+ by_epoch=True,
+ begin=0,
+ end=256)
+]
+
+default_hooks.update(
+ dict(
+ checkpoint=dict(interval=4, max_keep_ckpts=3),
+ logger=dict(interval=100)))
diff --git a/mmaction/configs/recognition/slowonly/__init__.py b/mmaction/configs/recognition/slowonly/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/mmaction/configs/recognition/slowonly/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py b/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py
new file mode 100644
index 0000000000..75393789f5
--- /dev/null
+++ b/mmaction/configs/recognition/slowonly/slowonly_r50_8xb16_4x16x1_256e_kinetics400_rgb.py
@@ -0,0 +1,139 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from ..._base_.models.slowonly_r50 import *
+ from ..._base_.default_runtime import *
+
+from mmengine.dataset import DefaultSampler
+from mmengine.optim import CosineAnnealingLR, LinearLR
+from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.sgd import SGD
+
+from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip,
+ FormatShape, PackActionInputs,
+ RandomResizedCrop, Resize, SampleFrames,
+ ThreeCrop, VideoDataset)
+from mmaction.evaluation import AccMetric
+
+# model settings
+model.update(dict(backbone=dict(pretrained=None)))
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+
+train_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(type=SampleFrames, clip_len=4, frame_interval=16, num_clips=1),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=RandomResizedCrop),
+ dict(type=Resize, scale=(224, 224), keep_ratio=False),
+ dict(type=Flip, flip_ratio=0.5),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+
+val_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(
+ type=SampleFrames,
+ clip_len=4,
+ frame_interval=16,
+ num_clips=1,
+ test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=CenterCrop, crop_size=224),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+
+test_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(
+ type=SampleFrames,
+ clip_len=4,
+ frame_interval=16,
+ num_clips=10,
+ test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=ThreeCrop, crop_size=256),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=True),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type=AccMetric)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type=EpochBasedTrainLoop, max_epochs=256, val_begin=1, val_interval=5)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning policy
+param_scheduler = [
+ dict(type=LinearLR, start_factor=0.1, by_epoch=True, begin=0, end=34),
+ dict(
+ type=CosineAnnealingLR,
+ T_max=222,
+ eta_min=0,
+ by_epoch=True,
+ begin=34,
+ end=256)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=1e-4),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# runtime settings
+default_hooks.update(dict(checkpoint=dict(interval=4, max_keep_ckpts=3)))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/mmaction/configs/recognition/swin/__init__.py b/mmaction/configs/recognition/swin/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/mmaction/configs/recognition/swin/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py b/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py
new file mode 100644
index 0000000000..187ebf4a62
--- /dev/null
+++ b/mmaction/configs/recognition/swin/swin_base_p244_w877_in1k_pre_8xb8_amp_32x2x1_30e_kinetics400_rgb.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from ..._base_.models.swin_tiny import *
+ from ..._base_.default_runtime import *
+
+from mmengine.dataset import DefaultSampler
+from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
+from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import AdamW
+
+from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip,
+ FormatShape, PackActionInputs,
+ RandomResizedCrop, Resize, SampleFrames,
+ ThreeCrop, VideoDataset)
+from mmaction.engine import SwinOptimWrapperConstructor
+from mmaction.evaluation import AccMetric
+
+model.update(
+ dict(
+ backbone=dict(
+ arch='base',
+ drop_path_rate=0.3,
+ pretrained= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_base_patch4_window7_224.pth' # noqa: E501
+ ),
+ cls_head=dict(in_channels=1024)))
+
+# dataset settings
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(type=SampleFrames, clip_len=32, frame_interval=2, num_clips=1),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=RandomResizedCrop),
+ dict(type=Resize, scale=(224, 224), keep_ratio=False),
+ dict(type=Flip, flip_ratio=0.5),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+val_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(
+ type=SampleFrames,
+ clip_len=32,
+ frame_interval=2,
+ num_clips=1,
+ test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(type=CenterCrop, crop_size=224),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+test_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(
+ type=SampleFrames,
+ clip_len=32,
+ frame_interval=2,
+ num_clips=4,
+ test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 224)),
+ dict(type=ThreeCrop, crop_size=224),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=True),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_test,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type=AccMetric)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type=EpochBasedTrainLoop, max_epochs=30, val_begin=1, val_interval=3)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+optim_wrapper = dict(
+ type=AmpOptimWrapper,
+ optimizer=dict(type=AdamW, lr=1e-3, betas=(0.9, 0.999), weight_decay=0.05),
+ constructor=SwinOptimWrapperConstructor,
+ paramwise_cfg=dict(
+ absolute_pos_embed=dict(decay_mult=0.),
+ relative_position_bias_table=dict(decay_mult=0.),
+ norm=dict(decay_mult=0.),
+ backbone=dict(lr_mult=0.1)))
+
+param_scheduler = [
+ dict(
+ type=LinearLR,
+ start_factor=0.1,
+ by_epoch=True,
+ begin=0,
+ end=2.5,
+ convert_to_iter_based=True),
+ dict(
+ type=CosineAnnealingLR,
+ T_max=30,
+ eta_min=0,
+ by_epoch=True,
+ begin=0,
+ end=30)
+]
+
+default_hooks.update(
+ dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5),
+ logger=dict(interval=100)))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=64)
diff --git a/mmaction/configs/recognition/uniformerv2/__init__.py b/mmaction/configs/recognition/uniformerv2/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/mmaction/configs/recognition/uniformerv2/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py
new file mode 100644
index 0000000000..0aa5db0d8a
--- /dev/null
+++ b/mmaction/configs/recognition/uniformerv2/uniformerv2_base_p16_res224_clip_kinetics710_pre_u8_kinetics400_rgb.py
@@ -0,0 +1,185 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from ..._base_.default_runtime import *
+
+from mmengine.dataset import DefaultSampler
+from mmengine.optim import CosineAnnealingLR, LinearLR
+from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import AdamW
+
+from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip,
+ FormatShape, PackActionInputs,
+ PytorchVideoWrapper, RandomResizedCrop, Resize,
+ ThreeCrop, UniformSample, VideoDataset)
+from mmaction.evaluation import AccMetric
+from mmaction.models import (ActionDataPreprocessor, Recognizer3D,
+ UniFormerHead, UniFormerV2)
+
+# model settings
+num_frames = 8
+model = dict(
+ type=Recognizer3D,
+ backbone=dict(
+ type=UniFormerV2,
+ input_resolution=224,
+ patch_size=16,
+ width=768,
+ layers=12,
+ heads=12,
+ t_size=num_frames,
+ dw_reduction=1.5,
+ backbone_drop_path_rate=0.,
+ temporal_downsample=False,
+ no_lmhra=True,
+ double_lmhra=True,
+ return_list=[8, 9, 10, 11],
+ n_layers=4,
+ n_dim=768,
+ n_head=12,
+ mlp_factor=4.,
+ drop_path_rate=0.,
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=False,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='backbone.')),
+ cls_head=dict(
+ type=UniFormerHead,
+ dropout_ratio=0.5,
+ num_classes=400,
+ in_channels=768,
+ average_clips='prob',
+ channel_map= # noqa: E251
+ 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json',
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='cls_head.')),
+ data_preprocessor=dict(
+ type=ActionDataPreprocessor,
+ mean=[114.75, 114.75, 114.75],
+ std=[57.375, 57.375, 57.375],
+ format_shape='NCTHW'))
+
+# dataset settings
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(type=UniformSample, clip_len=num_frames, num_clips=1),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 256)),
+ dict(
+ type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4),
+ dict(type=RandomResizedCrop),
+ dict(type=Resize, scale=(224, 224), keep_ratio=False),
+ dict(type=Flip, flip_ratio=0.5),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+
+val_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 224)),
+ dict(type=CenterCrop, crop_size=224),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+
+test_pipeline = [
+ dict(type=DecordInit, **file_client_args),
+ dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True),
+ dict(type=DecordDecode),
+ dict(type=Resize, scale=(-1, 224)),
+ dict(type=ThreeCrop, crop_size=224),
+ dict(type=FormatShape, input_format='NCTHW'),
+ dict(type=PackActionInputs)
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=True),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=VideoDataset,
+ ann_file=ann_file_test,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type=AccMetric)
+test_evaluator = dict(type=AccMetric)
+train_cfg = dict(
+ type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+base_lr = 2e-6
+optim_wrapper = dict(
+ optimizer=dict(
+ type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type=LinearLR,
+ start_factor=0.5,
+ by_epoch=True,
+ begin=0,
+ end=1,
+ convert_to_iter_based=True),
+ dict(
+ type=CosineAnnealingLR,
+ T_max=4,
+ eta_min_ratio=0.5,
+ by_epoch=True,
+ begin=1,
+ end=5,
+ convert_to_iter_based=True)
+]
+
+default_hooks.update(
+ dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5),
+ logger=dict(interval=100)))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=256)
diff --git a/mmaction/configs/skeleton/posec3d/__init__.py b/mmaction/configs/skeleton/posec3d/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/mmaction/configs/skeleton/posec3d/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmaction/configs/skeleton/posec3d/slowonly_r50_8xb16_u48_240e_ntu60_xsub_limb.py b/mmaction/configs/skeleton/posec3d/slowonly_r50_8xb16_u48_240e_ntu60_xsub_limb.py
new file mode 100644
index 0000000000..80399bd984
--- /dev/null
+++ b/mmaction/configs/skeleton/posec3d/slowonly_r50_8xb16_u48_240e_ntu60_xsub_limb.py
@@ -0,0 +1,161 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from ..._base_.default_runtime import *
+
+from mmengine.dataset import DefaultSampler, RepeatDataset
+from mmengine.optim import CosineAnnealingLR
+from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import SGD
+
+from mmaction.datasets import (CenterCrop, Flip, FormatShape,
+ GeneratePoseTarget, PackActionInputs,
+ PoseCompact, PoseDataset, PoseDecode,
+ RandomResizedCrop, Resize, UniformSampleFrames)
+from mmaction.evaluation import AccMetric
+from mmaction.models import I3DHead, Recognizer3D, ResNet3dSlowOnly
+
+model = dict(
+ type=Recognizer3D,
+ backbone=dict(
+ type=ResNet3dSlowOnly,
+ depth=50,
+ pretrained=None,
+ in_channels=17,
+ base_channels=32,
+ num_stages=3,
+ out_indices=(2, ),
+ stage_blocks=(4, 6, 3),
+ conv1_stride_s=1,
+ pool1_stride_s=1,
+ inflate=(0, 1, 1),
+ spatial_strides=(2, 2, 2),
+ temporal_strides=(1, 1, 2),
+ dilations=(1, 1, 1)),
+ cls_head=dict(
+ type=I3DHead,
+ in_channels=512,
+ num_classes=60,
+ dropout_ratio=0.5,
+ average_clips='prob'))
+
+dataset_type = 'PoseDataset'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
+left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
+right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
+skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11],
+ [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2],
+ [1, 3], [2, 4], [11, 12]]
+left_limb = [0, 2, 3, 6, 7, 8, 12, 14]
+right_limb = [1, 4, 5, 9, 10, 11, 13, 15]
+train_pipeline = [
+ dict(type=UniformSampleFrames, clip_len=48),
+ dict(type=PoseDecode),
+ dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True),
+ dict(type=Resize, scale=(-1, 64)),
+ dict(type=RandomResizedCrop, area_range=(0.56, 1.0)),
+ dict(type=Resize, scale=(56, 56), keep_ratio=False),
+ dict(type=Flip, flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp),
+ dict(
+ type=GeneratePoseTarget,
+ sigma=0.6,
+ use_score=True,
+ with_kp=False,
+ with_limb=True,
+ skeletons=skeletons),
+ dict(type=FormatShape, input_format='NCTHW_Heatmap'),
+ dict(type=PackActionInputs)
+]
+val_pipeline = [
+ dict(type=UniformSampleFrames, clip_len=48, num_clips=1, test_mode=True),
+ dict(type=PoseDecode),
+ dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True),
+ dict(type=Resize, scale=(-1, 64)),
+ dict(type=CenterCrop, crop_size=64),
+ dict(
+ type=GeneratePoseTarget,
+ sigma=0.6,
+ use_score=True,
+ with_kp=False,
+ with_limb=True,
+ skeletons=skeletons),
+ dict(type=FormatShape, input_format='NCTHW_Heatmap'),
+ dict(type=PackActionInputs)
+]
+test_pipeline = [
+ dict(type=UniformSampleFrames, clip_len=48, num_clips=10, test_mode=True),
+ dict(type=PoseDecode),
+ dict(type=PoseCompact, hw_ratio=1., allow_imgpad=True),
+ dict(type=Resize, scale=(-1, 64)),
+ dict(type=CenterCrop, crop_size=64),
+ dict(
+ type=GeneratePoseTarget,
+ sigma=0.6,
+ use_score=True,
+ with_kp=False,
+ with_limb=True,
+ skeletons=skeletons,
+ double=True,
+ left_limb=left_limb,
+ right_limb=right_limb),
+ dict(type=FormatShape, input_format='NCTHW_Heatmap'),
+ dict(type=PackActionInputs)
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=True),
+ dataset=dict(
+ type=RepeatDataset,
+ times=10,
+ dataset=dict(
+ type=PoseDataset,
+ ann_file=ann_file,
+ split='xsub_train',
+ pipeline=train_pipeline)))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=PoseDataset,
+ ann_file=ann_file,
+ split='xsub_val',
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=PoseDataset,
+ ann_file=ann_file,
+ split='xsub_val',
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = [dict(type=AccMetric)]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type=EpochBasedTrainLoop, max_epochs=24, val_begin=1, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+param_scheduler = [
+ dict(
+ type=CosineAnnealingLR,
+ eta_min=0,
+ T_max=24,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=0.0003),
+ clip_grad=dict(max_norm=40, norm_type=2))
diff --git a/mmaction/datasets/__init__.py b/mmaction/datasets/__init__.py
index 0b34a72fc0..ded946b727 100644
--- a/mmaction/datasets/__init__.py
+++ b/mmaction/datasets/__init__.py
@@ -8,9 +8,10 @@
from .repeat_aug_dataset import RepeatAugDataset, repeat_pseudo_collate
from .transforms import * # noqa: F401, F403
from .video_dataset import VideoDataset
+from .video_text_dataset import VideoTextDataset
__all__ = [
'AVADataset', 'AVAKineticsDataset', 'ActivityNetDataset', 'AudioDataset',
'BaseActionDataset', 'PoseDataset', 'RawframeDataset', 'RepeatAugDataset',
- 'VideoDataset', 'repeat_pseudo_collate'
+ 'VideoDataset', 'repeat_pseudo_collate', 'VideoTextDataset'
]
diff --git a/mmaction/datasets/ava_dataset.py b/mmaction/datasets/ava_dataset.py
index 1bc64c7b91..e744dc9f5e 100644
--- a/mmaction/datasets/ava_dataset.py
+++ b/mmaction/datasets/ava_dataset.py
@@ -15,10 +15,10 @@
@DATASETS.register_module()
class AVADataset(BaseActionDataset):
- """AVA dataset for spatial temporal detection.
+ """STAD dataset for spatial temporal action detection.
- Based on official AVA annotation files, the dataset loads raw frames,
- bounding boxes, proposals and applies specified transformations to return
+ The dataset loads raw frames/video files, bounding boxes,
+ proposals and applies specified transformations to return
a dict containing the frame tensors and other information.
This datasets can load information from the following files:
@@ -62,8 +62,8 @@ class AVADataset(BaseActionDataset):
filename_tmpl (str): Template for each filename.
Defaults to 'img_{:05}.jpg'.
start_index (int): Specify a start index for frames in consideration of
- different filename format. However, when taking frames as input,
- it should be set to 0, since frames from 0. Defaults to 0.
+ different filename format. It should be set to 1 for AVA, since
+            frame indices start from 1 in the AVA dataset. Defaults to 1.
proposal_file (str): Path to the proposal file like
``ava_dense_proposals_{train, val}.FAIR.recall_93.9.pkl``.
Defaults to None.
@@ -91,16 +91,22 @@ class AVADataset(BaseActionDataset):
Defaults to 902.
timestamp_end (int): The end point of included timestamps. The default
value is referred from the official website. Defaults to 1798.
- fps (int): Overrides the default FPS for the dataset. Defaults to 30.
+ use_frames (bool): Whether to use rawframes as input.
+ Defaults to True.
+ fps (int): Overrides the default FPS for the dataset. If set to 1,
+ means counting timestamp by frame, e.g. MultiSports dataset.
+ Otherwise by second. Defaults to 30.
+ multilabel (bool): Determines whether it is a multilabel recognition
+ task. Defaults to True.
"""
def __init__(self,
ann_file: str,
- exclude_file: str,
pipeline: List[Union[ConfigType, Callable]],
- label_file: str,
+ exclude_file: Optional[str] = None,
+ label_file: Optional[str] = None,
filename_tmpl: str = 'img_{:05}.jpg',
- start_index: int = 0,
+ start_index: int = 1,
proposal_file: str = None,
person_det_score_thr: float = 0.9,
num_classes: int = 81,
@@ -111,7 +117,9 @@ def __init__(self,
num_max_proposals: int = 1000,
timestamp_start: int = 900,
timestamp_end: int = 1800,
+ use_frames: bool = True,
fps: int = 30,
+ multilabel: bool = True,
**kwargs) -> None:
self._FPS = fps # Keep this as standard
self.custom_classes = custom_classes
@@ -133,6 +141,8 @@ def __init__(self,
self.timestamp_end = timestamp_end
self.num_max_proposals = num_max_proposals
self.filename_tmpl = filename_tmpl
+ self.use_frames = use_frames
+ self.multilabel = multilabel
super().__init__(
ann_file,
@@ -185,8 +195,11 @@ def parse_img_record(self, img_records: List[dict]) -> tuple:
])
# The format can be directly used by BCELossWithLogits
- label = np.zeros(self.num_classes, dtype=np.float32)
- label[valid_labels] = 1.
+ if self.multilabel:
+ label = np.zeros(self.num_classes, dtype=np.float32)
+ label[valid_labels] = 1.
+ else:
+ label = valid_labels
labels.append(label)
entity_ids.append(img_record['entity_id'])
@@ -212,13 +225,17 @@ def load_data_list(self) -> List[dict]:
label = self.custom_classes.index(label)
video_id = line_split[0]
- timestamp = int(line_split[1])
+ timestamp = int(line_split[1]) # count by second or frame.
img_key = f'{video_id},{timestamp:04d}'
entity_box = np.array(list(map(float, line_split[2:6])))
entity_id = int(line_split[7])
- shot_info = (0, (self.timestamp_end - self.timestamp_start) *
- self._FPS)
+ if self.use_frames:
+ shot_info = (0, (self.timestamp_end - self.timestamp_start) *
+ self._FPS)
+ # for video data, automatically get shot info when decoding
+ else:
+ shot_info = None
video_info = dict(
video_id=video_id,
@@ -246,6 +263,8 @@ def load_data_list(self) -> List[dict]:
shot_info=shot_info,
fps=self._FPS,
ann=ann)
+ if not self.use_frames:
+ video_info['filename'] = video_info.pop('frame_dir')
data_list.append(video_info)
return data_list
@@ -301,6 +320,12 @@ def get_data_info(self, idx: int) -> dict:
proposals = proposals[:self.num_max_proposals]
data_info['proposals'] = proposals
+ assert data_info['proposals'].max() <= 1 and \
+ data_info['proposals'].min() >= 0, \
+ (f'relative proposals invalid: max value '
+ f'{data_info["proposals"].max()}, min value '
+ f'{data_info["proposals"].min()}')
+
ann = data_info.pop('ann')
data_info['gt_bboxes'] = ann['gt_bboxes']
data_info['gt_labels'] = ann['gt_labels']
diff --git a/mmaction/datasets/rawframe_dataset.py b/mmaction/datasets/rawframe_dataset.py
index 8089e75917..2d42201d64 100644
--- a/mmaction/datasets/rawframe_dataset.py
+++ b/mmaction/datasets/rawframe_dataset.py
@@ -133,7 +133,9 @@ def load_data_list(self) -> List[dict]:
idx += 1
# idx for label[s]
label = [int(x) for x in line_split[idx:]]
- assert label, f'missing label in line: {line}'
+ # add a fake label for inference on a data list without labels
+ if not label:
+ label = [-1]
if self.multi_class:
assert self.num_classes is not None
video_info['label'] = label
diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py
index 2b83c415f5..d8b8cc4eb3 100644
--- a/mmaction/datasets/transforms/__init__.py
+++ b/mmaction/datasets/transforms/__init__.py
@@ -19,6 +19,7 @@
MelSpectrogram, MultiScaleCrop, RandomCrop,
RandomRescale, RandomResizedCrop, Resize, TenCrop,
ThreeCrop)
+from .text_transforms import CLIPTokenize
from .wrappers import ImgAug, PytorchVideoWrapper, TorchVisionWrapper
__all__ = [
@@ -38,5 +39,5 @@
'Resize', 'SampleAVAFrames', 'SampleFrames', 'TenCrop', 'ThreeCrop',
'ToMotion', 'TorchVisionWrapper', 'Transpose', 'UniformSample',
'UniformSampleFrames', 'UntrimmedSampleFrames', 'MMUniformSampleFrames',
- 'MMDecode', 'MMCompact'
+ 'MMDecode', 'MMCompact', 'CLIPTokenize'
]
diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py
index bdcc75ffb5..6ca61a4ccc 100644
--- a/mmaction/datasets/transforms/formatting.py
+++ b/mmaction/datasets/transforms/formatting.py
@@ -72,10 +72,14 @@ def transform(self, results: Dict) -> Dict:
elif 'audios' in results:
audios = results['audios']
packed_results['inputs'] = to_tensor(audios)
+ elif 'text' in results:
+ text = results['text']
+ packed_results['inputs'] = to_tensor(text)
else:
raise ValueError(
- 'Cannot get `imgs`, `keypoint`, `heatmap_imgs` '
- 'or `audios` in the input dict of `PackActionInputs`.')
+ 'Cannot get `imgs`, `keypoint`, `heatmap_imgs`, '
+ '`audios` or `text` in the input dict of '
+ '`PackActionInputs`.')
data_sample = ActionDataSample()
@@ -89,7 +93,8 @@ def transform(self, results: Dict) -> Dict:
if 'proposals' in results:
data_sample.proposals = InstanceData(
bboxes=to_tensor(results['proposals']))
- else:
+
+ if 'label' in results:
label_data = LabelData()
label_data.item = to_tensor(results['label'])
data_sample.gt_labels = label_data
@@ -138,11 +143,21 @@ def transform(self, results):
'dict of `PackActionInputs`.')
data_sample = ActionDataSample()
- instance_data = InstanceData()
for key in self.keys:
- if key in results:
+ if key not in results:
+ continue
+ if key == 'gt_bbox':
+ instance_data = InstanceData()
instance_data[key] = to_tensor(results[key])
- data_sample.gt_instances = instance_data
+ data_sample.gt_instances = instance_data
+ elif key == 'proposals':
+ instance_data = InstanceData()
+ instance_data[key] = to_tensor(results[key])
+ data_sample.proposals = instance_data
+ else:
+ raise NotImplementedError(
+ f"Key '{key}' is not supported in `PackLocalizationInputs`"
+ )
img_meta = {k: results[k] for k in self.meta_keys if k in results}
data_sample.set_metainfo(img_meta)
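With this change ``PackActionInputs`` can pack tokenized text as the model input, and ground-truth labels are only attached when a 'label' key is present, so unlabeled inference samples pass through cleanly. A rough stand-alone sketch of just these two branches (this is not the real transform, which also handles images, keypoints, heatmaps and audio, and stores labels in a ``LabelData``):

import torch

def pack_minimal(results):
    packed = {}
    if 'text' in results:                      # new retrieval branch
        packed['inputs'] = torch.as_tensor(results['text'])
    if 'label' in results:                     # labels are now optional
        packed['gt_labels'] = torch.as_tensor(results['label'])
    return packed

pack_minimal(dict(text=[49406, 320, 49407]))            # inference sample, no label
pack_minimal(dict(text=[49406, 320, 49407], label=3))   # training sample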
diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py
index 10309b2516..e876143cd3 100644
--- a/mmaction/datasets/transforms/loading.py
+++ b/mmaction/datasets/transforms/loading.py
@@ -411,32 +411,32 @@ def __repr__(self) -> str:
@TRANSFORMS.register_module()
class UniformSample(BaseTransform):
- """Uniformly sample frames from the video. Currently used for Something-
- Something V2 dataset. Modified from
- https://github.com/facebookresearch/SlowFast/blob/64a
+ """Uniformly sample frames from the video.
+
+ Modified from https://github.com/facebookresearch/SlowFast/blob/64a
bcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159.
- To sample an n-frame clip from the video. UniformSampleFrames basically
+ To sample an n-frame clip from the video. UniformSample basically
divides the video into n segments of equal length and randomly samples one
frame from each segment.
Required keys:
- - total_frames
- - start_index
+ - total_frames
+ - start_index
Added keys:
- - frame_inds
- - clip_len
- - frame_interval
- - num_clips
+ - frame_inds
+ - clip_len
+ - frame_interval
+ - num_clips
Args:
clip_len (int): Frames of each sampled output clip.
- num_clips (int): Number of clips to be sampled. Default: 1.
+ num_clips (int): Number of clips to be sampled. Defaults to 1.
test_mode (bool): Store True when building test or validation dataset.
- Default: False.
+ Defaults to False.
"""
def __init__(self,
@@ -448,17 +448,24 @@ def __init__(self,
self.num_clips = num_clips
self.test_mode = test_mode
- def _get_sample_clips(self, num_frames: int) -> np.array:
- """When video frames is shorter than target clip len, this strategy
- would repeat sample frame, rather than loop sample in 'loop' mode. In
- test mode, this strategy would sample the middle frame of each segment,
- rather than set a random seed, and therefore only support sample 1
- clip.
+ def _get_sample_clips(self, num_frames: int) -> np.ndarray:
+ """To sample an n-frame clip from the video. UniformSample basically
+ divides the video into n segments of equal length and randomly samples
+ one frame from each segment. When the duration of video frames is
+ shorter than the desired length of the target clip, this approach will
+ duplicate the sampled frame instead of looping the sample in "loop"
+ mode. In the test mode, when we need to sample multiple clips,
+ specifically 'n' clips, this method will further divide the segments
+ based on the number of clips to be sampled. The 'i-th' clip will
+ sample the frame located at the position 'i * len(segment) / n'
+ within the segment.
Args:
num_frames (int): Total number of frame in the video.
+
Returns:
- seq (list): the indexes of frames of sampled from the video.
+ seq (np.ndarray): the indexes of the frames sampled from the video.
"""
seg_size = float(num_frames - 1) / self.clip_len
inds = []
@@ -477,7 +484,15 @@ def _get_sample_clips(self, num_frames: int) -> np.array:
return np.array(inds)
- def transform(self, results: dict):
+ def transform(self, results: Dict) -> Dict:
+ """Perform the Uniform Sampling.
+
+ Args:
+ results (dict): The result dict.
+
+ Returns:
+ dict: The result dict.
+ """
num_frames = results['total_frames']
inds = self._get_sample_clips(num_frames)
@@ -490,7 +505,7 @@ def transform(self, results: dict):
results['num_clips'] = self.num_clips
return results
- def __repr__(self):
+ def __repr__(self) -> str:
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
f'num_clips={self.num_clips}, '
@@ -503,16 +518,19 @@ class UntrimmedSampleFrames(BaseTransform):
"""Sample frames from the untrimmed video.
Required keys are "filename", "total_frames", added or modified keys are
- "frame_inds", "frame_interval" and "num_clips".
+ "frame_inds", "clip_interval" and "num_clips".
Args:
- clip_len (int): The length of sampled clips. Default: 1.
+ clip_len (int): The length of sampled clips. Defaults to 1.
+ clip_interval (int): Clip interval of adjacent center of sampled
+ clips. Defaults to 16.
frame_interval (int): Temporal interval of adjacent sampled frames.
- Default: 16.
+ Defaults to 1.
"""
- def __init__(self, clip_len=1, frame_interval=16):
+ def __init__(self, clip_len=1, clip_interval=16, frame_interval=1):
self.clip_len = clip_len
+ self.clip_interval = clip_interval
self.frame_interval = frame_interval
def transform(self, results):
@@ -525,18 +543,21 @@ def transform(self, results):
total_frames = results['total_frames']
start_index = results['start_index']
- clip_centers = np.arange(self.frame_interval // 2, total_frames,
- self.frame_interval)
+ clip_centers = np.arange(self.clip_interval // 2, total_frames,
+ self.clip_interval)
num_clips = clip_centers.shape[0]
frame_inds = clip_centers[:, None] + np.arange(
- -(self.clip_len // 2), self.clip_len -
- (self.clip_len // 2))[None, :]
+ -(self.clip_len // 2 * self.frame_interval),
+ self.frame_interval *
+ (self.clip_len -
+ (self.clip_len // 2)), self.frame_interval)[None, :]
# clip frame_inds to legal range
frame_inds = np.clip(frame_inds, 0, total_frames - 1)
frame_inds = np.concatenate(frame_inds) + start_index
results['frame_inds'] = frame_inds.astype(np.int32)
results['clip_len'] = self.clip_len
+ results['clip_interval'] = self.clip_interval
results['frame_interval'] = self.frame_interval
results['num_clips'] = num_clips
return results
@@ -544,6 +565,7 @@ def transform(self, results):
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
+ f'clip_interval={self.clip_interval}, '
f'frame_interval={self.frame_interval})')
return repr_str
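After this change ``clip_interval`` controls the spacing between the centers of adjacent clips, while ``frame_interval`` controls the spacing of frames inside each clip. A quick numeric check of the new index arithmetic, using illustrative values:

import numpy as np

clip_len, clip_interval, frame_interval = 3, 16, 2
total_frames, start_index = 64, 0

clip_centers = np.arange(clip_interval // 2, total_frames, clip_interval)
# -> [ 8 24 40 56]
offsets = np.arange(-(clip_len // 2 * frame_interval),
                    frame_interval * (clip_len - clip_len // 2),
                    frame_interval)
# -> [-2  0  2]: frames around each center, spaced by frame_interval
frame_inds = np.clip(clip_centers[:, None] + offsets[None, :],
                     0, total_frames - 1).reshape(-1) + start_index
# -> [ 6  8 10 22 24 26 38 40 42 54 56 58]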
@@ -731,15 +753,18 @@ def transform(self, results):
fps = results['fps']
timestamp = results['timestamp']
timestamp_start = results['timestamp_start']
- shot_info = results['shot_info']
+ start_index = results.get('start_index', 0)
+ if results.get('total_frames') is not None:
+ shot_info = (0, results['total_frames'])
+ else:
+ shot_info = results['shot_info']
- center_index = fps * (timestamp - timestamp_start) + 1
+ center_index = fps * (timestamp - timestamp_start) + start_index
skip_offsets = np.random.randint(
-self.frame_interval // 2, (self.frame_interval + 1) // 2,
size=self.clip_len)
frame_inds = self._get_clips(center_index, skip_offsets, shot_info)
- start_index = results.get('start_index', 0)
frame_inds = np.array(frame_inds, dtype=np.int32) + start_index
results['frame_inds'] = frame_inds
@@ -1210,6 +1235,18 @@ def transform(self, results: Dict) -> Dict:
results['original_shape'] = imgs[0].shape[:2]
results['img_shape'] = imgs[0].shape[:2]
+ # we resize the gt_bboxes and proposals to their real scale
+ if 'gt_bboxes' in results:
+ h, w = results['img_shape']
+ scale_factor = np.array([w, h, w, h])
+ gt_bboxes = results['gt_bboxes']
+ gt_bboxes = (gt_bboxes * scale_factor).astype(np.float32)
+ results['gt_bboxes'] = gt_bboxes
+ if 'proposals' in results and results['proposals'] is not None:
+ proposals = results['proposals']
+ proposals = (proposals * scale_factor).astype(np.float32)
+ results['proposals'] = proposals
+
return results
def __repr__(self) -> str:
diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py
index 0abb987551..0420f4ec8d 100644
--- a/mmaction/datasets/transforms/pose_transforms.py
+++ b/mmaction/datasets/transforms/pose_transforms.py
@@ -121,7 +121,10 @@ def mapinds(inds):
kps[:, :, 1] *= h
num_kp = kps.shape[1]
- num_person = mode(frame_inds)[-1][0]
+ num_person = mode(frame_inds)[-1]
+ # Ensure compatibility with older versions of scipy
+ if isinstance(num_person, np.ndarray):
+ num_person = num_person[0]
new_kp = np.zeros([num_person, total_frames, num_kp, 2],
dtype=np.float16)
diff --git a/mmaction/datasets/transforms/text_transforms.py b/mmaction/datasets/transforms/text_transforms.py
new file mode 100644
index 0000000000..08b5e225ea
--- /dev/null
+++ b/mmaction/datasets/transforms/text_transforms.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+from mmcv.transforms import BaseTransform
+
+from mmaction.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class CLIPTokenize(BaseTransform):
+ """Tokenize text and convert to tensor."""
+
+ def transform(self, results: Dict) -> Dict:
+ """The transform function of :class:`CLIPTokenize`.
+
+ Args:
+ results (dict): The result dict.
+
+ Returns:
+ dict: The result dict.
+ """
+
+ try:
+ import clip
+ except ImportError:
+ raise ImportError('Please run `pip install '
+ 'git+https://github.com/openai/CLIP.git` '
+ 'to install clip first. ')
+
+ text = results['text']
+ text_tokenized = clip.tokenize(text)[0]
+ results['text'] = text_tokenized
+ return results
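``CLIPTokenize`` is a thin wrapper around OpenAI's ``clip.tokenize``. A minimal sketch of what it does to the results dict, assuming CLIP's default context length of 77 and an invented caption; it requires ``pip install git+https://github.com/openai/CLIP.git``:

import clip

results = {'text': 'a person is playing basketball'}
results['text'] = clip.tokenize(results['text'])[0]  # token-id tensor
print(results['text'].shape)                          # torch.Size([77])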
diff --git a/mmaction/datasets/video_dataset.py b/mmaction/datasets/video_dataset.py
index e085a8bcac..7d6cd5d44f 100644
--- a/mmaction/datasets/video_dataset.py
+++ b/mmaction/datasets/video_dataset.py
@@ -86,6 +86,9 @@ def load_data_list(self) -> List[dict]:
assert self.num_classes is not None
filename, label = line_split[0], line_split[1:]
label = list(map(int, label))
+ # add a fake label for inference on a data list without labels
+ elif len(line_split) == 1:
+ filename, label = line_split[0], -1
else:
filename, label = line_split
label = int(label)
diff --git a/mmaction/datasets/video_text_dataset.py b/mmaction/datasets/video_text_dataset.py
new file mode 100644
index 0000000000..c6f011dbf7
--- /dev/null
+++ b/mmaction/datasets/video_text_dataset.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from typing import Dict, List
+
+from mmengine.fileio import exists
+
+from mmaction.registry import DATASETS
+from .base import BaseActionDataset
+
+
+@DATASETS.register_module()
+class VideoTextDataset(BaseActionDataset):
+ """Video dataset for video-text task like video retrieval."""
+
+ def load_data_list(self) -> List[Dict]:
+ """Load annotation file to get video information."""
+ exists(self.ann_file)
+ data_list = []
+
+ with open(self.ann_file) as f:
+ video_dict = json.load(f)
+ for filename, texts in video_dict.items():
+ filename = osp.join(self.data_prefix['video'], filename)
+ video_text_pairs = []
+ for text in texts:
+ data_item = dict(filename=filename, text=text)
+ video_text_pairs.append(data_item)
+ data_list.extend(video_text_pairs)
+
+ return data_list
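``VideoTextDataset`` expects a JSON annotation mapping each video file to a list of captions and yields one sample per (video, caption) pair. A hedged sketch of a compatible annotation file and dataset config; the paths and captions are invented for illustration:

# ann_file.json (invented example):
# {
#     "video1.mp4": ["a man cooks pasta", "someone stirs a pot"],
#     "video2.mp4": ["a dog catches a frisbee"]
# }
train_dataset = dict(
    type='VideoTextDataset',
    ann_file='data/msrvtt/ann_file.json',          # assumed path
    data_prefix=dict(video='data/msrvtt/videos'),  # assumed path
    pipeline=train_pipeline)                       # defined elsewhere
# the dataset yields one sample per (video, caption) pair, 3 in this example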
diff --git a/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py b/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py
index 966b786508..bee8dd6771 100644
--- a/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py
+++ b/mmaction/engine/optimizers/layer_decay_optim_wrapper_constructor.py
@@ -10,6 +10,28 @@
from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS
+def get_layer_id_for_vit(var_name: str, max_layer_id: int) -> int:
+ """Get the layer id to set the different learning rates for ViT.
+
+ Args:
+ var_name (str): The key of the model.
+ num_max_layer (int): Maximum number of backbone layers.
+ Returns:
+ int: Returns the layer id of the key.
+ """
+
+ if var_name in ('backbone.cls_token', 'backbone.mask_token',
+ 'backbone.pos_embed'):
+ return 0
+ elif var_name.startswith('backbone.patch_embed'):
+ return 0
+ elif var_name.startswith('backbone.blocks'):
+ layer_id = int(var_name.split('.')[2])
+ return layer_id + 1
+ else:
+ return max_layer_id + 1
+
+
def get_layer_id_for_mvit(var_name, max_layer_id):
"""Get the layer id to set the different learning rates in ``layer_wise``
decay_type.
@@ -87,6 +109,9 @@ def add_params(self, params: List[dict], module: nn.Module,
layer_id = get_layer_id_for_mvit(
name, self.paramwise_cfg.get('num_layers'))
logger.info(f'set param {name} as id {layer_id}')
+ elif 'VisionTransformer' in module.backbone.__class__.__name__:
+ layer_id = get_layer_id_for_vit(name, num_layers)
+ logger.info(f'set param {name} as id {layer_id}')
else:
raise NotImplementedError()
else:
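``get_layer_id_for_vit`` maps parameter names to depth-ordered layer ids so the layer-wise learning-rate decay can also cover ViT backbones. A quick sanity check of the mapping, assuming a 12-block backbone; the parameter names below are illustrative, not copied from a checkpoint:

from mmaction.engine.optimizers.layer_decay_optim_wrapper_constructor import \
    get_layer_id_for_vit

assert get_layer_id_for_vit('backbone.cls_token', 12) == 0
assert get_layer_id_for_vit('backbone.patch_embed.projection.weight', 12) == 0
assert get_layer_id_for_vit('backbone.blocks.0.attn.qkv.weight', 12) == 1
assert get_layer_id_for_vit('backbone.blocks.11.mlp.fc1.weight', 12) == 12
assert get_layer_id_for_vit('cls_head.fc_cls.weight', 12) == 13  # head: max + 1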
diff --git a/mmaction/evaluation/functional/__init__.py b/mmaction/evaluation/functional/__init__.py
index f33a33027e..4a729294c4 100644
--- a/mmaction/evaluation/functional/__init__.py
+++ b/mmaction/evaluation/functional/__init__.py
@@ -7,6 +7,7 @@
softmax, top_k_accuracy, top_k_classes)
from .ava_utils import ava_eval, read_labelmap, results2csv
from .eval_detection import ActivityNetLocalization
+from .multisports_utils import frameAP, link_tubes, videoAP, videoAP_all
__all__ = [
'top_k_accuracy', 'mean_class_accuracy', 'confusion_matrix',
@@ -14,5 +15,6 @@
'average_recall_at_avg_proposals', 'pairwise_temporal_iou',
'average_precision_at_temporal_iou', 'ActivityNetLocalization', 'softmax',
'interpolated_precision_recall', 'mmit_mean_average_precision',
- 'top_k_classes', 'read_labelmap', 'ava_eval', 'results2csv'
+ 'top_k_classes', 'read_labelmap', 'ava_eval', 'results2csv', 'frameAP',
+ 'videoAP', 'link_tubes', 'videoAP_all'
]
diff --git a/mmaction/evaluation/functional/multisports_utils.py b/mmaction/evaluation/functional/multisports_utils.py
new file mode 100644
index 0000000000..516828c701
--- /dev/null
+++ b/mmaction/evaluation/functional/multisports_utils.py
@@ -0,0 +1,684 @@
+# ------------------------------------------------------------------------------
+# Adapted from https://github.com/MCG-NJU/MultiSports
+# Original licence: Copyright (c) MCG-NJU, under the MIT License.
+# ------------------------------------------------------------------------------
+
+import math
+from collections import defaultdict
+
+import numpy as np
+from rich.progress import track
+
+
+def area2d_voc(b):
+ """Compute the areas for a set of 2D boxes."""
+ return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
+
+
+def overlap2d_voc(b1, b2):
+ """Compute the overlaps between a set of boxes b1 and one box b2."""
+ xmin = np.maximum(b1[:, 0], b2[:, 0])
+ ymin = np.maximum(b1[:, 1], b2[:, 1])
+ xmax = np.minimum(b1[:, 2], b2[:, 2])
+ ymax = np.minimum(b1[:, 3], b2[:, 3])
+
+ width = np.maximum(0, xmax - xmin)
+ height = np.maximum(0, ymax - ymin)
+
+ return width * height
+
+
+def iou2d_voc(b1, b2):
+ """Compute the IoU between a set of boxes b1 and 1 box b2."""
+ if b1.ndim == 1:
+ b1 = b1[None, :]
+ if b2.ndim == 1:
+ b2 = b2[None, :]
+
+ assert b2.shape[0] == 1
+
+ ov = overlap2d_voc(b1, b2)
+
+ return ov / (area2d_voc(b1) + area2d_voc(b2) - ov)
+
+
+def iou3d_voc(b1, b2):
+ """Compute the IoU between two tubes with same temporal extent."""
+ assert b1.shape[0] == b2.shape[0]
+ assert np.all(b1[:, 0] == b2[:, 0])
+
+ ov = overlap2d_voc(b1[:, 1:5], b2[:, 1:5])
+
+ return np.mean(ov / (area2d_voc(b1[:, 1:5]) + area2d_voc(b2[:, 1:5]) - ov))
+
+
+def iou3dt_voc(b1, b2, spatialonly=False, temporalonly=False):
+ """Compute the spatio-temporal IoU between two tubes."""
+ tmin = max(b1[0, 0], b2[0, 0])
+ tmax = min(b1[-1, 0], b2[-1, 0])
+
+ if tmax < tmin:
+ return 0.0
+
+ temporal_inter = tmax - tmin
+ temporal_union = max(b1[-1, 0], b2[-1, 0]) - min(b1[0, 0], b2[0, 0])
+
+ tube1 = b1[int(np.where(
+ b1[:, 0] == tmin)[0]):int(np.where(b1[:, 0] == tmax)[0]) + 1, :]
+ tube2 = b2[int(np.where(
+ b2[:, 0] == tmin)[0]):int(np.where(b2[:, 0] == tmax)[0]) + 1, :]
+
+ if temporalonly:
+ return temporal_inter / temporal_union
+ return iou3d_voc(tube1, tube2) * (1. if spatialonly else temporal_inter /
+ temporal_union)
+
+
+def pr_to_ap_voc(pr):
+ precision = pr[:, 0]
+ recall = pr[:, 1]
+ recall = np.concatenate([[0], recall, [1]])
+ precision = np.concatenate([[0], precision, [0]])
+
+ # Preprocess precision to be a non-decreasing array
+ for i in range(len(precision) - 2, -1, -1):
+ precision[i] = np.maximum(precision[i], precision[i + 1])
+
+ indices = np.where(recall[1:] != recall[:-1])[0] + 1
+ average_precision = np.sum(
+ (recall[indices] - recall[indices - 1]) * precision[indices])
+ return average_precision
+
+
+def nms_tubelets(dets, overlapThresh=0.3, top_k=None):
+ """Compute the NMS for a set of scored tubelets scored tubelets are numpy
+ array with 4K+1 columns, last one being the score return the indices of the
+ tubelets to keep."""
+
+ # If there are no detections, return an empty list
+ if len(dets) == 0:
+ return dets
+ if top_k is None:
+ top_k = len(dets)
+
+ K = int((dets.shape[1] - 1) / 4)
+
+ # Coordinates of bounding boxes
+ x1 = [dets[:, 4 * k] for k in range(K)]
+ y1 = [dets[:, 4 * k + 1] for k in range(K)]
+ x2 = [dets[:, 4 * k + 2] for k in range(K)]
+ y2 = [dets[:, 4 * k + 3] for k in range(K)]
+
+ # Compute the area of the bounding boxes and sort the bounding
+ # boxes by the bottom-right y-coordinate of the bounding box
+ # area = (x2 - x1 + 1) * (y2 - y1 + 1)
+ scores = dets[:, -1]
+ area = [(x2[k] - x1[k] + 1) * (y2[k] - y1[k] + 1) for k in range(K)]
+ order = np.argsort(scores)[::-1]
+ weight = np.zeros_like(scores) + 1
+ counter = 0
+
+ while order.size > 0:
+ i = order[0]
+ counter += 1
+
+ # Compute overlap
+ xx1 = [np.maximum(x1[k][i], x1[k][order[1:]]) for k in range(K)]
+ yy1 = [np.maximum(y1[k][i], y1[k][order[1:]]) for k in range(K)]
+ xx2 = [np.minimum(x2[k][i], x2[k][order[1:]]) for k in range(K)]
+ yy2 = [np.minimum(y2[k][i], y2[k][order[1:]]) for k in range(K)]
+
+ w = [np.maximum(0, xx2[k] - xx1[k] + 1) for k in range(K)]
+ h = [np.maximum(0, yy2[k] - yy1[k] + 1) for k in range(K)]
+
+ inter_area = [w[k] * h[k] for k in range(K)]
+ ious = sum([
+ inter_area[k] / (area[k][order[1:]] + area[k][i] - inter_area[k])
+ for k in range(K)
+ ])
+ index = np.where(ious > overlapThresh * K)[0]
+ weight[order[index + 1]] = 1 - ious[index]
+
+ index2 = np.where(ious <= overlapThresh * K)[0]
+ order = order[index2 + 1]
+
+ dets[:, -1] = dets[:, -1] * weight
+
+ new_scores = dets[:, -1]
+ new_order = np.argsort(new_scores)[::-1]
+ dets = dets[new_order, :]
+
+ return dets[:top_k, :]
+
+
+class Dataset():
+
+ def __init__(self, anno, frm_alldets) -> None:
+ self.anno = anno
+ self.video_list = self.anno['test_videos'][0]
+ self.nframes = self.anno['nframes']
+ self.labels = self.anno['labels']
+ self.frm_alldets = frm_alldets
+
+ def get_vid_dets(self):
+ self.vid_frm_det = defaultdict(list)
+ for frm_det in self.frm_alldets:
+ vid_idx = int(frm_det[0])
+ vid_name = self.video_list[vid_idx]
+ self.vid_frm_det[vid_name].append(frm_det)
+
+ self.vid_det = dict()
+ for vid_name, vid_frm_dets in self.vid_frm_det.items():
+ self.vid_det[vid_name] = dict()
+ for frm_idx in range(1, self.nframes[vid_name] + 1):
+ self.vid_det[vid_name][frm_idx] = dict()
+ for label_idx in range(len(self.labels)):
+ self.vid_det[vid_name][frm_idx][label_idx] = np.empty(
+ shape=(0, 5))
+ for frm_dets in vid_frm_dets:
+ frm_idx = int(frm_dets[1])
+ label_idx = int(frm_dets[2])
+ det = [*frm_dets[-4:], frm_dets[3]]  # bbox (x1, y1, x2, y2) + score
+ det = np.array(det)[None, :]
+
+ self.vid_det[vid_name][frm_idx][label_idx] = np.concatenate(
+ [self.vid_det[vid_name][frm_idx][label_idx], det])
+
+ return self.vid_det
+
+
+def link_tubes(anno, frm_dets, K=1, len_thre=15):
+
+ dataset = Dataset(anno, frm_dets)
+ vlist = dataset.video_list
+ total_VDets = dataset.get_vid_dets()
+
+ total_video_tubes = {label: [] for label in range(len(dataset.labels))}
+ for v in track(vlist, description='linking tubes...'):
+
+ RES = {}
+ if v not in total_VDets:
+ continue
+ VDets = total_VDets[v]
+ for ilabel in range(len(dataset.labels)):
+ FINISHED_TUBES = []
+ CURRENT_TUBES = [] # tubes is a list of tuple (frame, lstubelets)
+
+ # calculate average scores of tubelets in tubes
+
+ def tubescore(tt):
+ return np.mean(
+ np.array([tt[i][1][-1] for i in range(len(tt))]))
+
+ for frame in range(1, dataset.nframes[v] + 2 - K):
+ # load boxes of the new frame and do nms while keeping Nkeep highest scored # noqa: E501
+ ltubelets = np.array(
+ VDets[frame][ilabel]
+ ) # [:,range(4*K) + [4*K + 1 + ilabel]] Nx(4K+1) with (x1 y1 x2 y2)*K ilabel-score # noqa: E501
+
+ ltubelets = nms_tubelets(ltubelets, 0.6, top_k=10)
+
+ # just start new tubes
+ if frame == 1:
+ for i in range(ltubelets.shape[0]):
+ CURRENT_TUBES.append([(1, ltubelets[i, :])])
+ continue
+
+ # sort current tubes according to average score
+ avgscore = [tubescore(t) for t in CURRENT_TUBES]
+ argsort = np.argsort(-np.array(avgscore))
+ CURRENT_TUBES = [CURRENT_TUBES[i] for i in argsort]
+ # loop over tubes
+ finished = []
+ for it, t in enumerate(CURRENT_TUBES):
+ # compute ious between the last box of t and ltubelets
+ last_frame, last_tubelet = t[-1]
+ ious = []
+ offset = frame - last_frame
+ if offset < K:
+ nov = K - offset
+ ious = sum([
+ iou2d_voc(
+ ltubelets[:, 4 * iov:4 * iov + 4],
+ last_tubelet[4 * (iov + offset):4 *
+ (iov + offset + 1)])
+ for iov in range(nov)
+ ]) / float(nov)
+ else:
+ ious = iou2d_voc(ltubelets[:, :4],
+ last_tubelet[4 * K - 4:4 * K])
+
+ valid = np.where(ious >= 0.5)[0]
+
+ if valid.size > 0:
+ # take the one with maximum score
+ idx = valid[np.argmax(ltubelets[valid, -1])]
+ CURRENT_TUBES[it].append((frame, ltubelets[idx, :]))
+ ltubelets = np.delete(ltubelets, idx, axis=0)
+ else:
+ if offset >= K:
+ finished.append(it)
+
+ # finished tubes that are done
+ # process in reverse order so deletion uses the right index
+ for it in finished[::-1]:
+ FINISHED_TUBES.append(CURRENT_TUBES[it][:])
+ del CURRENT_TUBES[it]
+
+ # start new tubes
+ for i in range(ltubelets.shape[0]):
+ CURRENT_TUBES.append([(frame, ltubelets[i, :])])
+
+ # keep the tubes that are still unfinished
+ FINISHED_TUBES += CURRENT_TUBES
+
+ # build real tubes
+ output = []
+ for t in FINISHED_TUBES:
+ score = tubescore(t)
+
+ # discard tubes with a low average score
+ if score < 0.005:
+ continue
+
+ beginframe = t[0][0]
+ endframe = t[-1][0] + K - 1
+ length = endframe + 1 - beginframe
+
+ # delete tubes with short duration
+ if length < len_thre:
+ continue
+
+ # build final tubes by average the tubelets
+ out = np.zeros((length, 6), dtype=np.float32)
+ out[:, 0] = np.arange(beginframe, endframe + 1)
+ n_per_frame = np.zeros((length, 1), dtype=np.int32)
+ for i in range(len(t)):
+ frame, box = t[i]
+ for k in range(K):
+ out[frame - beginframe + k,
+ 1:5] += box[4 * k:4 * k + 4]
+ out[frame - beginframe + k,
+ -1] += box[-1] # single frame confidence
+ n_per_frame[frame - beginframe + k, 0] += 1
+ out[:, 1:] /= n_per_frame
+ output.append([out, score])
+ # out: [num_frames, (frame idx, x1, y1, x2, y2, score)]
+
+ RES[ilabel] = output
+ if output:
+ for tube, tube_score in output:
+ video_tube_res = tuple([v, tube_score, tube])
+ total_video_tubes[ilabel].append(video_tube_res)
+ return total_video_tubes
+
+
+def frameAP(GT, alldets, thr, print_info=True):
+
+ vlist = GT['test_videos'][0]
+
+ results = {}
+ for ilabel, label in enumerate(GT['labels']):
+ # detections of this class
+ if label in [
+ 'aerobic kick jump', 'aerobic off axis jump',
+ 'aerobic butterfly jump', 'aerobic balance turn',
+ 'basketball save', 'basketball jump ball'
+ ]:
+ if print_info:
+ print('do not evaluate {}'.format(label))
+ continue
+ # det format: (video_idx, frame_idx, label_idx, score, x1, y1, x2, y2)
+ detections = alldets[alldets[:, 2] == ilabel, :]
+
+ # load ground-truth of this class
+ gt = {}
+ for iv, v in enumerate(vlist):
+ tubes = GT['gttubes'][v]
+
+ if ilabel not in tubes:
+ continue
+
+ for tube in tubes[ilabel]:
+ for i in range(tube.shape[0]):
+ k = (iv, int(tube[i, 0])) # k -> (video_idx, frame_idx)
+ if k not in gt:
+ gt[k] = []
+ gt[k].append(tube[i, 1:5].tolist())
+
+ for k in gt:
+ gt[k] = np.array(gt[k])
+
+ # pr will be an array containing precision-recall values
+ pr = np.empty((detections.shape[0], 2),
+ dtype=np.float64) # precision,recall
+ gt_num = sum([g.shape[0] for g in gt.values()])
+ if gt_num == 0:
+ if print_info:
+ print('no such label', ilabel, label)
+ continue
+ fp = 0 # false positives
+ tp = 0 # true positives
+
+ is_gt_box_detected = {}
+ for i, j in enumerate(np.argsort(-detections[:, 3])):
+ k = (int(detections[j, 0]), int(detections[j, 1]))
+ box = detections[j, 4:8]
+ ispositive = False
+
+ if k in gt:
+ # match gt_box according to the iou
+ if k not in is_gt_box_detected:
+ is_gt_box_detected[k] = np.zeros(
+ gt[k].shape[0], dtype=bool)
+ ious = iou2d_voc(gt[k], box)
+ amax = np.argmax(ious)
+
+ if ious[amax] >= thr:
+ if not is_gt_box_detected[k][amax]:
+ ispositive = True
+ is_gt_box_detected[k][amax] = True
+
+ if ispositive:
+ tp += 1
+ else:
+ fp += 1
+ pr[i, 0] = float(tp) / float(tp + fp)
+ pr[i, 1] = float(tp) / float(gt_num)
+
+ results[label] = pr
+
+ # display results
+ ap = 100 * np.array([pr_to_ap_voc(results[label]) for label in results])
+ class_result = {}
+ for label in results:
+ class_result[label] = pr_to_ap_voc(results[label]) * 100
+ frameap_result = np.mean(ap)
+ if print_info:
+ print('frameAP_{}\n'.format(thr))
+ for label in class_result:
+ print('{:20s} {:8.2f}'.format(label, class_result[label]))
+ print('{:20s} {:8.2f}'.format('mAP', frameap_result))
+ return frameap_result
+
+
+def videoAP(GT, alldets, thr, print_info=True):
+
+ vlist = GT['test_videos'][0]
+
+ res = {}
+ for ilabel in range(len(GT['labels'])):
+ if GT['labels'][ilabel] in [
+ 'aerobic kick jump', 'aerobic off axis jump',
+ 'aerobic butterfly jump', 'aerobic balance turn',
+ 'basketball save', 'basketball jump ball'
+ ]:
+ if print_info:
+ print('do not evaluate {}'.format(GT['labels'][ilabel]))
+ continue
+ detections = alldets[ilabel]
+ # load ground-truth
+ gt = {}
+ for v in vlist:
+ tubes = GT['gttubes'][v]
+
+ if ilabel not in tubes:
+ continue
+
+ gt[v] = tubes[ilabel]
+
+ if len(gt[v]) == 0:
+ del gt[v]
+
+ # precision,recall
+ pr = np.empty((len(detections), 2), dtype=np.float64)
+
+ gt_num = sum([len(g) for g in gt.values()])  # total number of gt tubes
+ fp = 0 # false positives
+ tp = 0 # true positives
+ if gt_num == 0:
+ if print_info:
+ print('no such label', ilabel, GT['labels'][ilabel])
+ continue
+ is_gt_box_detected = {}
+ for i, j in enumerate(
+ np.argsort(-np.array([dd[1] for dd in detections]))):
+ v, score, tube = detections[j]
+ ispositive = False
+ if v in gt:
+ if v not in is_gt_box_detected:
+ is_gt_box_detected[v] = np.zeros(len(gt[v]), dtype=bool)
+ ious = [iou3dt_voc(g, tube) for g in gt[v]]
+ amax = np.argmax(ious)
+ if ious[amax] >= thr:
+ if not is_gt_box_detected[v][amax]:
+ ispositive = True
+ is_gt_box_detected[v][amax] = True
+
+ if ispositive:
+ tp += 1
+ else:
+ fp += 1
+
+ pr[i, 0] = float(tp) / float(tp + fp)
+ pr[i, 1] = float(tp) / float(gt_num)
+ res[GT['labels'][ilabel]] = pr
+
+ # display results
+ ap = 100 * np.array([pr_to_ap_voc(res[label]) for label in res])
+ videoap_result = np.mean(ap)
+ class_result = {}
+ for label in res:
+ class_result[label] = pr_to_ap_voc(res[label]) * 100
+ if print_info:
+ print('VideoAP_{}\n'.format(thr))
+ for label in class_result:
+ print('{:20s} {:8.2f}'.format(label, class_result[label]))
+ print('{:20s} {:8.2f}'.format('mAP', videoap_result))
+ return videoap_result
+
+
+def videoAP_all(groundtruth, detections):
+ high_ap = 0
+ for i in range(10):
+ thr = 0.5 + 0.05 * i
+ high_ap += videoAP(groundtruth, detections, thr, print_info=False)
+ high_ap = high_ap / 10.0
+
+ low_ap = 0
+ for i in range(9):
+ thr = 0.05 + 0.05 * i
+ low_ap += videoAP(groundtruth, detections, thr, print_info=False)
+ low_ap = low_ap / 9.0
+
+ all_ap = 0
+ for i in range(9):
+ thr = 0.1 + 0.1 * i
+ all_ap += videoAP(groundtruth, detections, thr, print_info=False)
+ all_ap = all_ap / 9.0
+
+ map = {
+ 'v_map_0.05:0.45': round(low_ap, 4),
+ 'v_map_0.10:0.90': round(all_ap, 4),
+ 'v_map_0.50:0.95': round(high_ap, 4),
+ }
+ return map
+
+
+def videoAP_error(GT, alldets, thr):
+
+ vlist = GT['test_videos'][0]
+
+ th_s = math.sqrt(thr)
+ th_t = math.sqrt(thr)
+
+ print('th is', thr)
+ print('th_s is', th_s)
+ print('th_t is', th_t)
+
+ res = {}
+ dupgt = {}
+ for v in vlist:
+ dupgt[v] = GT['gttubes'][v]
+ # compute video error for every class
+ for ilabel in range(len(GT['labels'])):
+ if GT['labels'][ilabel] in [
+ 'aerobic kick jump', 'aerobic off axis jump',
+ 'aerobic butterfly jump', 'aerobic balance turn',
+ 'basketball save', 'basketball jump ball'
+ ]:
+ print('do not evaluate {}'.format(GT['labels'][ilabel]))
+ continue
+ detections = alldets[ilabel]
+
+ pr = np.zeros((len(detections), 11), dtype=np.float32)
+
+ gt_num = 0
+ for v in dupgt:
+ if ilabel in dupgt[v]:
+ gt_num = gt_num + len(dupgt[v][ilabel])
+ fp = 0 # false positives
+ tp = 0 # true positives
+ ER = 0 # repeat error: repeated predictions for the same instance
+ EN = 0 # extra error
+ EL = 0 # localization errors
+ EC = 0 # classification error
+ ET = 0 # timing error
+ ErrCT = 0 # cls + time
+ ECL = 0 # cls + loc
+ ETL = 0 # time + loc
+ ECTL = 0 # cls + time + loc
+
+ is_gt_box_detected = {}
+ for i, j in enumerate(
+ np.argsort(-np.array([dd[1] for dd in detections]))):
+ v, score, tube = detections[j]
+ ispositive = False
+ end = False
+ if ilabel in dupgt[v]:
+ if v not in is_gt_box_detected:
+ is_gt_box_detected[v] = np.zeros(
+ len(dupgt[v][ilabel]), dtype=bool)
+ ious = [iou3dt_voc(g, tube) for g in dupgt[v][ilabel]]
+ amax = np.argmax(ious)
+ if ious[amax] >= thr:
+ if not is_gt_box_detected[v][amax]:
+ ispositive = True
+ is_gt_box_detected[v][amax] = True
+ else:
+ ER += 1
+ end = True
+ if end is False:
+ ious = []
+ for ll in dupgt[v]:
+ if ll == ilabel:
+ continue
+ for g in dupgt[v][ll]:
+ ious.append(iou3dt_voc(g, tube))
+ if ious != []:
+ amax = np.argmax(ious)
+ if ious[amax] >= thr:
+ EC += 1
+ end = True
+ if end is False:
+ all_gt = []
+ ious = []
+ for ll in dupgt[v]:
+ for g in dupgt[v][ll]:
+ all_gt.append((ll, g))
+ ious.append(iou3dt_voc(g, tube))
+ amax = np.argmax(ious)
+ assert (ious[amax] < thr)
+ if ious[amax] > 0:
+ t_iou = iou3dt_voc(
+ all_gt[amax][1], tube, temporalonly=True)
+ s_iou = iou3dt_voc(all_gt[amax][1], tube, spatialonly=True)
+ if all_gt[amax][0] == ilabel:
+ assert (t_iou < th_t or s_iou < th_s)
+ if t_iou >= th_t:
+ EL += 1
+ end = True
+ elif s_iou >= th_s:
+ ET += 1
+ end = True
+ else:
+ ETL += 1
+ end = True
+ else:
+ assert (t_iou < th_t or s_iou < th_s)
+ if t_iou >= th_t:
+ ECL += 1
+ end = True
+ elif s_iou >= th_s:
+ ErrCT += 1
+ end = True
+ else:
+ ECTL += 1
+ end = True
+ else:
+ EN += 1
+ end = True
+ assert (end is True)
+ if ispositive:
+ tp += 1
+ # fn -= 1
+ else:
+ fp += 1
+ assert (fp == (ER + EN + EL + EC + ET + ErrCT + ECL + ETL + ECTL))
+ pr[i, 0] = max(float(tp) / float(tp + fp), 0.)
+ pr[i, 1] = max(float(tp) / float(gt_num), 0.)
+ pr[i, 2] = max(float(ER) / float(tp + fp), 0.)
+ pr[i, 3] = max(float(EN) / float(tp + fp), 0.)
+ pr[i, 4] = max(float(EL) / float(tp + fp), 0.)
+ pr[i, 5] = max(float(EC) / float(tp + fp), 0.)
+ pr[i, 6] = max(float(ET) / float(tp + fp), 0.)
+ pr[i, 7] = max(float(ErrCT) / float(tp + fp), 0.)
+ pr[i, 8] = max(float(ECL) / float(tp + fp), 0.)
+ pr[i, 9] = max(float(ETL) / float(tp + fp), 0.)
+ pr[i, 10] = max(float(ECTL) / float(tp + fp), 0.)
+
+ res[GT['labels'][ilabel]] = pr
+
+ # display results
+ AP = 100 * np.array([pr_to_ap_voc(res[label][:, [0, 1]]) for label in res])
+ othersap = [
+ 100 * np.array([pr_to_ap_voc(res[label][:, [j, 1]]) for label in res])
+ for j in range(2, 11)
+ ]
+
+ ER = othersap[0]
+ EN = othersap[1]
+ EL = othersap[2]
+ EC = othersap[3]
+ ET = othersap[4]
+ ErrCT = othersap[5]
+ ECL = othersap[6]
+ ETL = othersap[7]
+ ECTL = othersap[8]
+ # missed detections = 1 - recall
+ EM = []
+ for label in res:
+ if res[label].shape[0] != 0:
+ EM.append(100 - 100 * res[label][-1, 1])
+ else:
+ EM.append(100)
+ EM = np.array(EM)
+
+ LIST = [AP, ER, EN, EL, EC, ET, ErrCT, ECL, ETL, ECTL, EM]
+
+ print('Error Analysis')
+
+ print('')
+ print(
+ '{:20s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s}' # noqa: E501
+ .format('label', ' AP ', ' Repeat ', ' Extra ', ' Loc. ', ' Cls. ',
+ ' Time ', ' Cls.+Time ', ' Cls.+Loc. ', ' Time+Loc. ',
+ ' C+T+L ', ' missed '))
+ print('')
+ for il, label in enumerate(res):
+ print('{:20s} '.format(label) +
+ ' '.join(['{:8.2f}'.format(L[il]) for L in LIST]))
+ print('')
+ print('{:20s} '.format('mean') +
+ ' '.join(['{:8.2f}'.format(np.mean(L)) for L in LIST]))
+ print('')
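``pr_to_ap_voc`` integrates a cumulative precision/recall curve with the usual monotone-precision trick, and it underlies both frameAP and videoAP above. A tiny worked example with invented values, where each row is (precision, recall) after processing one more detection against four ground-truth boxes:

import numpy as np

from mmaction.evaluation.functional.multisports_utils import pr_to_ap_voc

pr = np.array([[1.00, 0.25],   # 1 TP / 1 det
               [0.50, 0.25],   # 1 TP / 2 det
               [0.66, 0.50],   # 2 TP / 3 det
               [0.75, 0.75]])  # 3 TP / 4 det
print(pr_to_ap_voc(pr))        # -> 0.625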
diff --git a/mmaction/evaluation/metrics/__init__.py b/mmaction/evaluation/metrics/__init__.py
index 0493dae036..8bf22c6672 100644
--- a/mmaction/evaluation/metrics/__init__.py
+++ b/mmaction/evaluation/metrics/__init__.py
@@ -2,5 +2,10 @@
from .acc_metric import AccMetric, ConfusionMatrix
from .anet_metric import ANetMetric
from .ava_metric import AVAMetric
+from .multisports_metric import MultiSportsMetric
+from .retrieval_metric import RetrievalMetric
-__all__ = ['AccMetric', 'AVAMetric', 'ANetMetric', 'ConfusionMatrix']
+__all__ = [
+ 'AccMetric', 'AVAMetric', 'ANetMetric', 'ConfusionMatrix',
+ 'MultiSportsMetric', 'RetrievalMetric'
+]
diff --git a/mmaction/evaluation/metrics/multisports_metric.py b/mmaction/evaluation/metrics/multisports_metric.py
new file mode 100644
index 0000000000..18b500fe7e
--- /dev/null
+++ b/mmaction/evaluation/metrics/multisports_metric.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, Optional, Sequence, Tuple
+
+import numpy as np
+from mmengine import load
+from mmengine.evaluator import BaseMetric
+
+from mmaction.evaluation import frameAP, link_tubes, videoAP, videoAP_all
+from mmaction.registry import METRICS
+
+
+@METRICS.register_module()
+class MultiSportsMetric(BaseMetric):
+ """MAP Metric for MultiSports dataset."""
+ default_prefix: Optional[str] = 'mAP'
+
+ def __init__(self,
+ ann_file: str,
+ metric_options: Optional[dict] = dict(
+ F_mAP=dict(thr=(0.5)),
+ V_mAP=dict(thr=(0.2, 0.5), all=True, tube_thr=15)),
+ collect_device: str = 'cpu',
+ verbose: bool = True,
+ prefix: Optional[str] = None):
+ super().__init__(collect_device=collect_device, prefix=prefix)
+
+ self.metric_options = metric_options
+ self.annos = load(ann_file)
+ self.verbose = verbose
+
+ def process(self, data_batch: Sequence[Tuple[Any, dict]],
+ data_samples: Sequence[dict]) -> None:
+ """Process one batch of data samples and predictions. The processed
+ results should be stored in ``self.results``, which will be used to
+ compute the metrics when all batches have been processed.
+
+ Args:
+ data_batch (Sequence[Tuple[Any, dict]]): A batch of data
+ from the dataloader.
+ data_samples (Sequence[dict]): A batch of outputs from
+ the model.
+ """
+
+ for pred in data_samples:
+ video_key = pred['video_id'].split('.mp4')[0]
+ frm_num = pred['timestamp']
+ bboxes = pred['pred_instances']['bboxes'].cpu().numpy()
+ cls_scores = pred['pred_instances']['scores'].cpu().numpy()
+ det_result = [video_key, frm_num, bboxes, cls_scores]
+
+ self.results.append(det_result)
+
+ def compute_metrics(self, results: list) -> dict:
+ """Compute the metrics from processed results.
+
+ Args:
+ results (list): The processed results of each batch.
+ Returns:
+ dict: The computed metrics. The keys are the names of the metrics,
+ and the values are corresponding results.
+ """
+ test_videos = self.annos['test_videos'][0]
+ resolutions = self.annos['resolution']
+ detections = []
+ for result in results:
+ video_key, frm_num, bboxes, cls_scores = result
+ for bbox, cls_score in zip(bboxes, cls_scores):
+ video_idx = test_videos.index(video_key)
+ pred_label = np.argmax(cls_score)
+ score = cls_score[pred_label]
+ h, w = resolutions[video_key]
+ bbox *= np.array([w, h, w, h])
+ instance_result = np.array(
+ [video_idx, frm_num, pred_label, score, *bbox])
+ detections.append(instance_result)
+
+ frm_detections = np.array(detections)
+
+ metric_result = dict()
+ f_map = frameAP(self.annos, frm_detections,
+ self.metric_options['F_mAP']['thr'], self.verbose)
+ metric_result.update({'frameAP': round(f_map, 4)})
+ video_tubes = link_tubes(
+ self.annos,
+ frm_detections,
+ len_thre=self.metric_options['V_mAP']['tube_thr'])
+
+ v_map = {}
+ for thr in self.metric_options['V_mAP']['thr']:
+ map = videoAP(
+ self.annos, video_tubes, thr, print_info=self.verbose)
+ v_map.update({f'v_map@{thr}': round(map, 4)})
+ metric_result.update(v_map)
+ if self.metric_options['V_mAP'].get('all'):
+ all_map = videoAP_all(self.annos, video_tubes)
+ metric_result.update(all_map)
+ return metric_result
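A hypothetical evaluator config for the new metric; the annotation path is a placeholder and the thresholds simply mirror the defaults shown above:

val_evaluator = dict(
    type='MultiSportsMetric',
    ann_file='data/multisports/annotations/multisports_GT.pkl',  # assumed path
    metric_options=dict(
        F_mAP=dict(thr=0.5),
        V_mAP=dict(thr=(0.2, 0.5), all=True, tube_thr=15)))
test_evaluator = val_evaluator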
diff --git a/mmaction/evaluation/metrics/retrieval_metric.py b/mmaction/evaluation/metrics/retrieval_metric.py
new file mode 100644
index 0000000000..0375c0c6c5
--- /dev/null
+++ b/mmaction/evaluation/metrics/retrieval_metric.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from collections import OrderedDict
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+
+from mmaction.registry import METRICS
+
+
+@METRICS.register_module()
+class RetrievalMetric(BaseMetric):
+ """Metric for video retrieval task.
+
+ Args:
+ metric_list (str | tuple[str]): The list of the metrics to be
+ computed. Defaults to ``('R1', 'R5', 'R10', 'MdR', 'MnR')``.
+ collect_device (str): Device name used for collecting results from
+ different ranks during distributed training. Must be 'cpu' or
+ 'gpu'. Defaults to 'cpu'.
+ prefix (str, optional): The prefix that will be added in the metric
+ names to disambiguate homonymous metrics of different evaluators.
+ If prefix is not provided in the argument, self.default_prefix
+ will be used instead. Defaults to None.
+ """
+
+ default_prefix = 'retrieval'
+
+ def __init__(self,
+ metric_list: Union[Tuple[str],
+ str] = ('R1', 'R5', 'R10', 'MdR', 'MnR'),
+ collect_device: str = 'cpu',
+ prefix: Optional[str] = None) -> None:
+ super().__init__(collect_device=collect_device, prefix=prefix)
+ if isinstance(metric_list, str):
+ metric_list = (metric_list, )
+
+ for metric in metric_list:
+ if metric not in ['R1', 'R5', 'R10', 'MdR', 'MnR']:
+ raise ValueError(f'RetrievalMetric only supports '
+ f"'R1', 'R5', 'R10', 'MdR', 'MnR', "
+ f"but got '{metric}. '")
+
+ self.metric_list = metric_list
+
+ def process(self, data_batch: Optional[Dict],
+ data_samples: Sequence[Dict]) -> None:
+ """Process one batch of data samples and data_samples. The processed
+ results should be stored in ``self.results``, which will be used to
+ compute the metrics when all batches have been processed.
+
+ Args:
+ data_batch (dict, optional): A batch of data from the dataloader.
+ data_samples (Sequence[dict]): A batch of outputs from the model.
+ """
+ data_samples = copy.deepcopy(data_samples)
+
+ for data_sample in data_samples:
+ results = dict()
+ features = data_sample['features']
+ video_feature = features['video_feature'].cpu().numpy()
+ text_feature = features['text_feature'].cpu().numpy()
+ results['video_feature'] = video_feature
+ results['text_feature'] = text_feature
+ self.results.append(results)
+
+ def compute_metrics(self, results: List) -> Dict:
+ """Compute the metrics from processed results.
+
+ Args:
+ results (list): The processed results of each batch.
+
+ Returns:
+ dict: The computed metrics. The keys are the names of the metrics,
+ and the values are corresponding results.
+ """
+
+ video_features = np.stack([res['video_feature'] for res in results])
+ text_features = np.stack([res['text_feature'] for res in results])
+
+ video_features = video_features / np.linalg.norm(
+ video_features, axis=-1, keepdims=True)
+ text_features = text_features / np.linalg.norm(
+ text_features, axis=-1, keepdims=True)
+
+ similarity = text_features @ video_features.T
+
+ sx = np.sort(-similarity)
+ d = np.diag(-similarity)
+ ind = np.where((sx - d[:, None]) == 0)[1]
+
+ metrics = OrderedDict()
+ for metric in self.metric_list:
+ if metric == 'R1':
+ metrics['R1'] = float(np.sum(ind == 0)) * 100 / len(ind)
+ elif metric == 'R5':
+ metrics['R5'] = float(np.sum(ind < 5)) * 100 / len(ind)
+ elif metric == 'R10':
+ metrics['R10'] = float(np.sum(ind < 10)) * 100 / len(ind)
+ elif metric == 'MdR':
+ metrics['MdR'] = np.median(ind) + 1
+ elif metric == 'MnR':
+ metrics['MnR'] = np.mean(ind) + 1
+
+ return metrics
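The rank computation above sorts each text's similarities in descending order and finds where the matching video lands. A small self-contained illustration with an invented 3x3 text-to-video similarity matrix (the ground-truth pair sits on the diagonal):

import numpy as np

similarity = np.array([[0.9, 0.2, 0.1],
                       [0.3, 0.4, 0.8],
                       [0.2, 0.7, 0.6]])
sx = np.sort(-similarity)            # similarities sorted high-to-low per text
d = np.diag(-similarity)             # similarity of each ground-truth pair
ind = np.where((sx - d[:, None]) == 0)[1]
print(ind)                                        # [0 1 1]: ranks of the correct videos
print(float(np.sum(ind == 0)) * 100 / len(ind))   # R1 = 33.33...
print(np.median(ind) + 1)                         # MdR = 2.0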
diff --git a/mmaction/models/__init__.py b/mmaction/models/__init__.py
index e7d52f88e6..6c53b29254 100644
--- a/mmaction/models/__init__.py
+++ b/mmaction/models/__init__.py
@@ -8,5 +8,6 @@
from .necks import * # noqa: F401,F403
from .recognizers import * # noqa: F401,F403
from .roi_heads import * # noqa: F401,F403
+from .similarity import * # noqa: F401,F403
from .task_modules import * # noqa: F401,F403
from .utils import * # noqa: F401,F403
diff --git a/mmaction/models/backbones/mobilenet_v2.py b/mmaction/models/backbones/mobilenet_v2.py
index f5a02ad60d..2ac5897645 100644
--- a/mmaction/models/backbones/mobilenet_v2.py
+++ b/mmaction/models/backbones/mobilenet_v2.py
@@ -1,10 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Union
+
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import ConvModule
-from mmengine.logging import MMLogger
-from mmengine.model.weight_init import constant_init, kaiming_init
-from mmengine.runner import load_checkpoint
+from mmengine.model import BaseModule
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from mmaction.registry import MODELS
@@ -19,9 +19,10 @@ def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
value (int): The original channel number.
divisor (int): The divisor to fully divide the channel number.
min_value (int, optional): The minimum value of the output channel.
- Default: None, means that the minimum value equal to the divisor.
+ Defaults to None, which means the minimum value equals the
+ divisor.
min_ratio (float, optional): The minimum ratio of the rounded channel
- number to the original channel number. Default: 0.9.
+ number to the original channel number. Defaults to 0.9.
Returns:
int: The modified output channel number
"""
@@ -45,13 +46,13 @@ class InvertedResidual(nn.Module):
expand_ratio (int): adjusts number of channels of the hidden layer
in InvertedResidual by this amount.
conv_cfg (dict): Config dict for convolution layer.
- Default: None, which means using conv2d.
+ Defaults to None, which means using conv2d.
norm_cfg (dict): Config dict for normalization layer.
- Default: dict(type='BN').
+ Defaults to dict(type='BN').
act_cfg (dict): Config dict for activation layer.
- Default: dict(type='ReLU6').
+ Defaults to dict(type='ReLU6').
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
- memory while slowing down the training speed. Default: False.
+ memory while slowing down the training speed. Defaults to False.
Returns:
Tensor: The output tensor
"""
@@ -129,29 +130,34 @@ def _inner_forward(x):
@MODELS.register_module()
-class MobileNetV2(nn.Module):
+class MobileNetV2(BaseModule):
"""MobileNetV2 backbone.
Args:
- pretrained (str | None): Name of pretrained model. Default: None.
+ pretrained (str | None): Name of pretrained model. Defaults to None.
widen_factor (float): Width multiplier, multiply number of
- channels in each layer by this amount. Default: 1.0.
+ channels in each layer by this amount. Defaults to 1.0.
out_indices (None or Sequence[int]): Output from which stages.
- Default: (7, ).
+ Defaults to (7, ).
frozen_stages (int): Stages to be frozen (all param fixed). Note that
- the last stage in ``MobileNetV2`` is ``conv2``. Default: -1,
+ the last stage in ``MobileNetV2`` is ``conv2``. Defaults to -1,
which means not freezing any parameters.
conv_cfg (dict): Config dict for convolution layer.
- Default: None, which means using conv2d.
+ Defaults to None, which means using conv2d.
norm_cfg (dict): Config dict for normalization layer.
- Default: dict(type='BN').
+ Defaults to dict(type='BN').
act_cfg (dict): Config dict for activation layer.
- Default: dict(type='ReLU6').
+ Defaults to dict(type='ReLU6').
norm_eval (bool): Whether to set norm layers to eval mode, namely,
freeze running stats (mean and var). Note: Effect on Batch Norm
- and its variants only. Default: False.
+ and its variants only. Defaults to False.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
- memory while slowing down the training speed. Default: False.
+ memory while slowing down the training speed. Defaults to False.
+ init_cfg (dict or list[dict]): Initialization config dict. Defaults to
+ ``[
+ dict(type='Kaiming', layer='Conv2d',),
+ dict(type='Constant', layer=['GroupNorm', '_BatchNorm'], val=1.)
+ ]``.
"""
# Parameters to build layers. 4 parameters are needed to construct a
@@ -169,8 +175,17 @@ def __init__(self,
norm_cfg=dict(type='BN2d', requires_grad=True),
act_cfg=dict(type='ReLU6', inplace=True),
norm_eval=False,
- with_cp=False):
- super().__init__()
+ with_cp=False,
+ init_cfg: Optional[Union[Dict, List[Dict]]] = [
+ dict(type='Kaiming', layer='Conv2d'),
+ dict(
+ type='Constant',
+ layer=['GroupNorm', '_BatchNorm'],
+ val=1.)
+ ]):
+ if pretrained is not None:
+ init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+ super().__init__(init_cfg=init_cfg)
self.pretrained = pretrained
self.widen_factor = widen_factor
self.out_indices = out_indices
@@ -239,9 +254,9 @@ def make_layer(self, out_channels, num_blocks, stride, expand_ratio):
Args:
out_channels (int): out_channels of block.
num_blocks (int): number of blocks.
- stride (int): stride of the first block. Default: 1
+ stride (int): stride of the first block. Defaults to 1
expand_ratio (int): Expand the number of channels of the
- hidden layer in InvertedResidual by this ratio. Default: 6.
+ hidden layer in InvertedResidual by this ratio. Defaults to 6.
"""
layers = []
for i in range(num_blocks):
@@ -261,21 +276,6 @@ def make_layer(self, out_channels, num_blocks, stride, expand_ratio):
return nn.Sequential(*layers)
- def init_weights(self):
- """Initiate the parameters either from existing checkpoint or from
- scratch."""
- if isinstance(self.pretrained, str):
- logger = MMLogger.get_current_instance()
- load_checkpoint(self, self.pretrained, strict=False, logger=logger)
- elif self.pretrained is None:
- for m in self.modules():
- if isinstance(m, nn.Conv2d):
- kaiming_init(m)
- elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
- constant_init(m, 1)
- else:
- raise TypeError('pretrained must be a str or None')
-
def forward(self, x):
"""Defines the computation performed at every call.
diff --git a/mmaction/models/backbones/mobilenet_v2_tsm.py b/mmaction/models/backbones/mobilenet_v2_tsm.py
index 2df95ab47c..dfdf9611b7 100644
--- a/mmaction/models/backbones/mobilenet_v2_tsm.py
+++ b/mmaction/models/backbones/mobilenet_v2_tsm.py
@@ -1,4 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.logging import MMLogger
+from mmengine.runner.checkpoint import _load_checkpoint
+
from mmaction.registry import MODELS
from .mobilenet_v2 import InvertedResidual, MobileNetV2
from .resnet_tsm import TemporalShift
@@ -9,19 +12,26 @@ class MobileNetV2TSM(MobileNetV2):
"""MobileNetV2 backbone for TSM.
Args:
- num_segments (int): Number of frame segments. Default: 8.
+ num_segments (int): Number of frame segments. Defaults to 8.
is_shift (bool): Whether to make temporal shift in res layers.
- Default: True.
- shift_div (int): Number of div for shift. Default: 8.
+ Defaults to True.
+ shift_div (int): Number of div for shift. Defaults to 8.
+ pretrained2d (bool): Whether to load a pretrained 2D model.
+ Defaults to True.
**kwargs (keyword arguments, optional): Arguments for MobileNetV2.
"""
- def __init__(self, num_segments=8, is_shift=True, shift_div=8, **kwargs):
+ def __init__(self,
+ num_segments=8,
+ is_shift=True,
+ shift_div=8,
+ pretrained2d=True,
+ **kwargs):
super().__init__(**kwargs)
self.num_segments = num_segments
self.is_shift = is_shift
self.shift_div = shift_div
- super().init_weights()
+ self.pretrained2d = pretrained2d
self.init_structure()
def make_temporal_shift(self):
@@ -41,5 +51,40 @@ def init_structure(self):
if self.is_shift:
self.make_temporal_shift()
+ def load_original_weights(self, logger):
+ original_state_dict = _load_checkpoint(
+ self.pretrained, map_location='cpu')
+ if 'state_dict' in original_state_dict:
+ original_state_dict = original_state_dict['state_dict']
+
+ wrapped_layers_map = dict()
+ for name, module in self.named_modules():
+ ori_name = name
+ for wrap_prefix in ['.net']:
+ if wrap_prefix in ori_name:
+ ori_name = ori_name.replace(wrap_prefix, '')
+ wrapped_layers_map[ori_name] = name
+
+ # convert wrapped keys
+ for param_name in list(original_state_dict.keys()):
+ layer_name = '.'.join(param_name.split('.')[:-1])
+ if layer_name in wrapped_layers_map:
+ wrapped_name = param_name.replace(
+ layer_name, wrapped_layers_map[layer_name])
+ original_state_dict[wrapped_name] = original_state_dict.pop(
+ param_name)
+
+ msg = self.load_state_dict(original_state_dict, strict=True)
+ logger.info(msg)
+
def init_weights(self):
- pass
+ """Initiate the parameters either from existing checkpoint or from
+ scratch."""
+ if self.pretrained2d:
+ logger = MMLogger.get_current_instance()
+ self.load_original_weights(logger)
+ else:
+ if self.pretrained:
+ self.init_cfg = dict(
+ type='Pretrained', checkpoint=self.pretrained)
+ super().init_weights()
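The 2D ImageNet checkpoint can only be loaded strictly because ``TemporalShift`` wraps each shifted conv in a ``.net`` submodule, so its parameter names gain an extra level. A toy sketch of the renaming rule used by ``load_original_weights``; the concrete key below is invented for illustration, not copied from a real checkpoint:

# '.net' appears wherever the module tree gained a TemporalShift wrapper
ori_key = 'layer3.1.conv.0.conv.weight'          # key in the 2D checkpoint
wrapped_key = 'layer3.1.conv.0.conv.net.weight'  # same weight inside TemporalShift
# load_original_weights builds an ori_name -> wrapped_name map from
# named_modules() and renames checkpoint keys before load_state_dict(strict=True)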
diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py
index ff694ca7b0..1923f62b3e 100644
--- a/mmaction/models/backbones/mvit.py
+++ b/mmaction/models/backbones/mvit.py
@@ -14,6 +14,7 @@
from mmengine.utils import to_3tuple
from mmaction.registry import MODELS
+from mmaction.utils import get_str_type
from ..utils.embed import PatchEmbed3D
@@ -328,7 +329,7 @@ def init_weights(self) -> None:
super().init_weights()
if (isinstance(self.init_cfg, dict)
- and self.init_cfg['type'] == 'Pretrained'):
+ and get_str_type(self.init_cfg['type']) == 'Pretrained'):
# Suppress rel_pos_zero_init if use pretrained model.
return
@@ -854,7 +855,7 @@ def init_weights(self, pretrained: Optional[str] = None) -> None:
super().init_weights()
if (isinstance(self.init_cfg, dict)
- and self.init_cfg['type'] == 'Pretrained'):
+ and get_str_type(self.init_cfg['type']) == 'Pretrained'):
# Suppress default init if use pretrained model.
return
diff --git a/mmaction/models/backbones/resnet.py b/mmaction/models/backbones/resnet.py
index c599bcc311..5e10d67703 100644
--- a/mmaction/models/backbones/resnet.py
+++ b/mmaction/models/backbones/resnet.py
@@ -1,14 +1,14 @@
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
import mmengine
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmengine.logging import MMLogger
-from mmengine.model.weight_init import constant_init, kaiming_init
-from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint
+from mmengine.model import BaseModule
+from mmengine.runner.checkpoint import _load_checkpoint
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from torch.utils import checkpoint as cp
@@ -306,7 +306,7 @@ def make_res_layer(block: nn.Module,
@MODELS.register_module()
-class ResNet(nn.Module):
+class ResNet(BaseModule):
"""ResNet backbone.
Args:
@@ -339,6 +339,11 @@ class ResNet(nn.Module):
partial_bn (bool): Whether to use partial bn. Defaults to False.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. Defaults to False.
+ init_cfg (dict or list[dict]): Initialization config dict. Defaults to
+ ``[
+ dict(type='Kaiming', layer='Conv2d',),
+ dict(type='Constant', layer='BatchNorm', val=1.)
+ ]``.
"""
arch_settings = {
@@ -349,24 +354,30 @@ class ResNet(nn.Module):
152: (Bottleneck, (3, 8, 36, 3))
}
- def __init__(self,
- depth: int,
- pretrained: Optional[str] = None,
- torchvision_pretrain: bool = True,
- in_channels: int = 3,
- num_stages: int = 4,
- out_indices: Sequence[int] = (3, ),
- strides: Sequence[int] = (1, 2, 2, 2),
- dilations: Sequence[int] = (1, 1, 1, 1),
- style: str = 'pytorch',
- frozen_stages: int = -1,
- conv_cfg: ConfigType = dict(type='Conv'),
- norm_cfg: ConfigType = dict(type='BN2d', requires_grad=True),
- act_cfg: ConfigType = dict(type='ReLU', inplace=True),
- norm_eval: bool = False,
- partial_bn: bool = False,
- with_cp: bool = False) -> None:
- super().__init__()
+ def __init__(
+ self,
+ depth: int,
+ pretrained: Optional[str] = None,
+ torchvision_pretrain: bool = True,
+ in_channels: int = 3,
+ num_stages: int = 4,
+ out_indices: Sequence[int] = (3, ),
+ strides: Sequence[int] = (1, 2, 2, 2),
+ dilations: Sequence[int] = (1, 1, 1, 1),
+ style: str = 'pytorch',
+ frozen_stages: int = -1,
+ conv_cfg: ConfigType = dict(type='Conv'),
+ norm_cfg: ConfigType = dict(type='BN2d', requires_grad=True),
+ act_cfg: ConfigType = dict(type='ReLU', inplace=True),
+ norm_eval: bool = False,
+ partial_bn: bool = False,
+ with_cp: bool = False,
+ init_cfg: Optional[Union[Dict, List[Dict]]] = [
+ dict(type='Kaiming', layer='Conv2d'),
+ dict(type='Constant', layer='BatchNorm2d', val=1.)
+ ]
+ ) -> None:
+ super().__init__(init_cfg=init_cfg)
if depth not in self.arch_settings:
raise KeyError(f'invalid depth {depth} for resnet')
self.depth = depth
@@ -540,14 +551,12 @@ def init_weights(self) -> None:
self._load_torchvision_checkpoint(logger)
else:
# ours
- load_checkpoint(
- self, self.pretrained, strict=False, logger=logger)
+ if self.pretrained:
+ self.init_cfg = dict(
+ type='Pretrained', checkpoint=self.pretrained)
+ super().init_weights()
elif self.pretrained is None:
- for m in self.modules():
- if isinstance(m, nn.Conv2d):
- kaiming_init(m)
- elif isinstance(m, nn.BatchNorm2d):
- constant_init(m, 1)
+ super().init_weights()
else:
raise TypeError('pretrained must be a str or None')
diff --git a/mmaction/models/backbones/resnet2plus1d.py b/mmaction/models/backbones/resnet2plus1d.py
index 0656c4b126..36cc50528c 100644
--- a/mmaction/models/backbones/resnet2plus1d.py
+++ b/mmaction/models/backbones/resnet2plus1d.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from mmaction.registry import MODELS
+from mmaction.utils import get_str_type
from .resnet3d import ResNet3d
@@ -14,7 +15,7 @@ class ResNet2Plus1d(ResNet3d):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.pretrained2d is False
- assert self.conv_cfg['type'] == 'Conv2plus1d'
+ assert get_str_type(self.conv_cfg['type']) == 'Conv2plus1d'
def _freeze_stages(self):
"""Prevent all the parameters from being optimized before
diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py
index c639e1eae6..0079c96cb7 100644
--- a/mmaction/models/backbones/resnet_tsm.py
+++ b/mmaction/models/backbones/resnet_tsm.py
@@ -1,7 +1,9 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
-from mmcv.cnn import NonLocal3d
+from mmcv.cnn import ConvModule, NonLocal3d
+from mmengine.logging import MMLogger
+from mmengine.runner.checkpoint import _load_checkpoint
from torch.nn.modules.utils import _ntuple
from mmaction.registry import MODELS
@@ -128,21 +130,26 @@ class ResNetTSM(ResNet):
"""ResNet backbone for TSM.
Args:
- num_segments (int): Number of frame segments. Default: 8.
+ num_segments (int): Number of frame segments. Defaults to 8.
is_shift (bool): Whether to make temporal shift in reset layers.
- Default: True.
+ Defaults to True.
non_local (Sequence[int]): Determine whether to apply non-local module
- in the corresponding block of each stages. Default: (0, 0, 0, 0).
- non_local_cfg (dict): Config for non-local module. Default: ``dict()``.
- shift_div (int): Number of div for shift. Default: 8.
+            in the corresponding block of each stage.
+ Defaults to (0, 0, 0, 0).
+ non_local_cfg (dict): Config for non-local module.
+ Defaults to ``dict()``.
+ shift_div (int): Number of div for shift. Defaults to 8.
shift_place (str): Places in resnet layers for shift, which is chosen
from ['block', 'blockres'].
If set to 'block', it will apply temporal shift to all child blocks
in each resnet layer.
If set to 'blockres', it will apply temporal shift to each `conv1`
layer of all child blocks in each resnet layer.
- Default: 'blockres'.
- temporal_pool (bool): Whether to add temporal pooling. Default: False.
+ Defaults to 'blockres'.
+ temporal_pool (bool): Whether to add temporal pooling.
+ Defaults to False.
+ pretrained2d (bool): Whether to load pretrained 2D model.
+ Defaults to True.
**kwargs (keyword arguments, optional): Arguments for ResNet.
"""
@@ -155,6 +162,7 @@ def __init__(self,
shift_div=8,
shift_place='blockres',
temporal_pool=False,
+ pretrained2d=True,
**kwargs):
super().__init__(depth, **kwargs)
self.num_segments = num_segments
@@ -165,8 +173,7 @@ def __init__(self,
self.non_local = non_local
self.non_local_stages = _ntuple(self.num_stages)(non_local)
self.non_local_cfg = non_local_cfg
- # TODO use convert key to load weights
- super().init_weights()
+ self.pretrained2d = pretrained2d
self.init_structure()
def init_structure(self):
@@ -298,6 +305,67 @@ def make_non_local(self):
self.num_segments,
self.non_local_cfg)
+ def load_original_weights(self, logger):
+        """Load weights from the original checkpoint, which requires
+        converting keys."""
+ state_dict_torchvision = _load_checkpoint(
+ self.pretrained, map_location='cpu')
+ if 'state_dict' in state_dict_torchvision:
+ state_dict_torchvision = state_dict_torchvision['state_dict']
+
+ wrapped_layers_map = dict()
+ for name, module in self.named_modules():
+ # convert torchvision keys
+ ori_name = name
+ for wrap_prefix in ['.net', '.block']:
+ if wrap_prefix in ori_name:
+ ori_name = ori_name.replace(wrap_prefix, '')
+ wrapped_layers_map[ori_name] = name
+
+ if isinstance(module, ConvModule):
+ if 'downsample' in ori_name:
+ # layer{X}.{Y}.downsample.conv->layer{X}.{Y}.downsample.0
+ tv_conv_name = ori_name + '.0'
+ # layer{X}.{Y}.downsample.bn->layer{X}.{Y}.downsample.1
+ tv_bn_name = ori_name + '.1'
+ else:
+ # layer{X}.{Y}.conv{n}.conv->layer{X}.{Y}.conv{n}
+ tv_conv_name = ori_name
+ # layer{X}.{Y}.conv{n}.bn->layer{X}.{Y}.bn{n}
+ tv_bn_name = ori_name.replace('conv', 'bn')
+
+ for conv_param in ['.weight', '.bias']:
+ if tv_conv_name + conv_param in state_dict_torchvision:
+ state_dict_torchvision[ori_name+'.conv'+conv_param] = \
+ state_dict_torchvision.pop(tv_conv_name+conv_param)
+
+ for bn_param in [
+ '.weight', '.bias', '.running_mean', '.running_var'
+ ]:
+ if tv_bn_name + bn_param in state_dict_torchvision:
+ state_dict_torchvision[ori_name+'.bn'+bn_param] = \
+ state_dict_torchvision.pop(tv_bn_name+bn_param)
+
+ # convert wrapped keys
+ for param_name in list(state_dict_torchvision.keys()):
+ layer_name = '.'.join(param_name.split('.')[:-1])
+ if layer_name in wrapped_layers_map:
+ wrapped_name = param_name.replace(
+ layer_name, wrapped_layers_map[layer_name])
+ state_dict_torchvision[
+ wrapped_name] = state_dict_torchvision.pop(param_name)
+
+ msg = self.load_state_dict(state_dict_torchvision, strict=False)
+ logger.info(msg)
+
def init_weights(self):
- """Initialize weights."""
- pass
+ """Initiate the parameters either from existing checkpoint or from
+ scratch."""
+ if self.pretrained2d:
+ logger = MMLogger.get_current_instance()
+ self.load_original_weights(logger)
+ else:
+ if self.pretrained:
+ self.init_cfg = dict(
+ type='Pretrained', checkpoint=self.pretrained)
+ super().init_weights()
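Since weight initialization no longer happens in `__init__`, the 2D-to-TSM key conversion now runs when `init_weights()` is called. A rough usage sketch (the checkpoint URI is an illustrative assumption):

from mmaction.registry import MODELS

tsm = MODELS.build(
    dict(
        type='ResNetTSM',
        depth=50,
        num_segments=8,
        pretrained='torchvision://resnet50',  # illustrative checkpoint URI
        pretrained2d=True))
# load_original_weights() remaps torchvision keys (e.g. conv1 -> conv1.conv)
# onto the ConvModule-wrapped layers before loading the state dict.
tsm.init_weights()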
diff --git a/mmaction/models/backbones/vit_mae.py b/mmaction/models/backbones/vit_mae.py
index 21ff837724..31210beba2 100644
--- a/mmaction/models/backbones/vit_mae.py
+++ b/mmaction/models/backbones/vit_mae.py
@@ -7,12 +7,17 @@
from mmcv.cnn.bricks import DropPath
from mmcv.cnn.bricks.transformer import FFN, PatchEmbed
from mmengine.model import BaseModule, ModuleList
-from mmengine.utils import to_2tuple
from torch import Tensor, nn
from mmaction.registry import MODELS
from mmaction.utils import ConfigType, OptConfigType
+try:
+ from mmdet.registry import MODELS as MMDET_MODELS
+ mmdet_imported = True
+except (ImportError, ModuleNotFoundError):
+ mmdet_imported = False
+
class Attention(BaseModule):
"""Multi-head Self-attention.
@@ -246,6 +251,8 @@ class VisionTransformer(BaseModule):
use_mean_pooling (bool): If True, take the mean pooling over all
positions. Defaults to True.
pretrained (str, optional): Name of pretrained model. Default: None.
+ return_feat_map (bool): If True, return the feature in the shape of
+ `[B, C, T, H, W]`. Defaults to False.
init_cfg (dict or list[dict]): Initialization config dict. Defaults to
``[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
@@ -273,6 +280,7 @@ def __init__(self,
tubelet_size: int = 2,
use_mean_pooling: int = True,
pretrained: Optional[str] = None,
+ return_feat_map: bool = False,
init_cfg: Optional[Union[Dict, List[Dict]]] = [
dict(
type='TruncNormal', layer='Linear', std=0.02,
@@ -285,21 +293,21 @@ def __init__(self,
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
super().__init__(init_cfg=init_cfg)
- patch_size = to_2tuple(patch_size)
- img_size = to_2tuple(img_size)
+ self.embed_dims = embed_dims
+ self.patch_size = patch_size
self.patch_embed = PatchEmbed(
in_channels=in_channels,
embed_dims=embed_dims,
conv_type='Conv3d',
- kernel_size=(tubelet_size, ) + patch_size,
- stride=(tubelet_size, ) + patch_size,
+ kernel_size=(tubelet_size, patch_size, patch_size),
+ stride=(tubelet_size, patch_size, patch_size),
padding=(0, 0, 0),
dilation=(1, 1, 1))
- num_patches = (img_size[1] // patch_size[1]) * \
- (img_size[0] // patch_size[0]) * \
- (num_frames // tubelet_size)
+ grid_size = img_size // patch_size
+ num_patches = grid_size**2 * (num_frames // tubelet_size)
+ self.grid_size = (grid_size, grid_size)
if use_learnable_pos_emb:
self.pos_embed = nn.Parameter(
@@ -336,6 +344,8 @@ def __init__(self,
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
self.fc_norm = None
+ self.return_feat_map = return_feat_map
+
def forward(self, x: Tensor) -> Tensor:
"""Defines the computation performed at every call.
@@ -345,17 +355,39 @@ def forward(self, x: Tensor) -> Tensor:
Tensor: The feature of the input
samples extracted by the backbone.
"""
+ b, _, _, h, w = x.shape
+ h //= self.patch_size
+ w //= self.patch_size
x = self.patch_embed(x)[0]
- B, _, _ = x.size()
+ if (h, w) != self.grid_size:
+ pos_embed = self.pos_embed.reshape(-1, *self.grid_size,
+ self.embed_dims)
+ pos_embed = pos_embed.permute(0, 3, 1, 2)
+ pos_embed = F.interpolate(
+ pos_embed, size=(h, w), mode='bicubic', align_corners=False)
+ pos_embed = pos_embed.permute(0, 2, 3, 1).flatten(1, 2)
+ pos_embed = pos_embed.reshape(1, -1, self.embed_dims)
+ else:
+ pos_embed = self.pos_embed
- x = x + self.pos_embed
+ x = x + pos_embed
x = self.pos_drop(x)
for blk in self.blocks:
x = blk(x)
x = self.norm(x)
+
+ if self.return_feat_map:
+ x = x.reshape(b, -1, h, w, self.embed_dims)
+ x = x.permute(0, 4, 1, 2, 3)
+ return x
+
if self.fc_norm is not None:
return self.fc_norm(x.mean(1))
- else:
- return x[:, 0]
+
+ return x[:, 0]
+
+
+if mmdet_imported:
+ MMDET_MODELS.register_module()(VisionTransformer)
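With `return_feat_map=True`, the backbone keeps the `[B, C, T, H, W]` layout, and the positional-embedding interpolation above lets it accept resolutions other than `img_size`. An illustrative backbone config (all values are assumptions, not defaults introduced by this patch):

backbone = dict(
    type='VisionTransformer',
    img_size=224,
    patch_size=16,
    embed_dims=768,
    num_frames=16,
    tubelet_size=2,
    return_feat_map=True)  # emit a [B, C, T, H/16, W/16] feature map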
diff --git a/mmaction/models/data_preprocessors/data_preprocessor.py b/mmaction/models/data_preprocessors/data_preprocessor.py
index 5a11eefd3b..891cb8f386 100644
--- a/mmaction/models/data_preprocessors/data_preprocessor.py
+++ b/mmaction/models/data_preprocessors/data_preprocessor.py
@@ -5,7 +5,7 @@
from mmengine.model import BaseDataPreprocessor, stack_batch
from mmaction.registry import MODELS
-from mmaction.utils.typing import SampleList
+from mmaction.utils import SampleList
@MODELS.register_module()
@@ -19,6 +19,8 @@ class ActionDataPreprocessor(BaseDataPreprocessor):
of channels of images or stacked optical flow. Defaults to None.
to_rgb (bool): Whether to convert image from BGR to RGB.
Defaults to False.
+ to_float32 (bool): Whether to convert data to float32.
+ Defaults to True.
blending (dict, optional): Config for batch blending.
Defaults to None.
format_shape (str): Format shape of input data.
@@ -29,10 +31,12 @@ def __init__(self,
mean: Optional[Sequence[Union[float, int]]] = None,
std: Optional[Sequence[Union[float, int]]] = None,
to_rgb: bool = False,
+ to_float32: bool = True,
blending: Optional[dict] = None,
format_shape: str = 'NCHW') -> None:
super().__init__()
self.to_rgb = to_rgb
+ self.to_float32 = to_float32
self.format_shape = format_shape
if mean is not None:
@@ -98,8 +102,7 @@ def forward_onesample(self, data, training: bool = False) -> dict:
training (bool): Whether to enable training time augmentation.
Returns:
- dict: Data in the same format as the model
- input.
+ dict: Data in the same format as the model input.
"""
inputs, data_samples = data['inputs'], data['data_samples']
inputs, data_samples = self.preprocess(inputs, data_samples, training)
@@ -139,7 +142,7 @@ def preprocess(self,
mean = self.mean.view(view_shape)
std = self.std.view(view_shape)
batch_inputs = (batch_inputs - mean) / std
- else:
+ elif self.to_float32:
batch_inputs = batch_inputs.to(torch.float32)
# ----- Blending -----
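`to_float32` only takes effect when no `mean`/`std` normalization is configured; it decides whether un-normalized inputs are cast to float32. A sketch of the two settings (values are illustrative):

# Cast un-normalized inputs to float32 (the default, matching old behaviour).
preprocessor = dict(type='ActionDataPreprocessor', to_float32=True)

# Keep un-normalized inputs in their original dtype, e.g. uint8.
preprocessor_raw = dict(type='ActionDataPreprocessor', to_float32=False)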
diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py
index 5a1b74a9f8..ee96ba8994 100644
--- a/mmaction/models/heads/__init__.py
+++ b/mmaction/models/heads/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base import BaseHead
+from .feature_head import FeatureHead
from .gcn_head import GCNHead
from .i3d_head import I3DHead
from .mvit_head import MViTHead
@@ -18,5 +19,5 @@
__all__ = [
'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead',
'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead',
- 'TimeSformerHead', 'UniFormerHead', 'RGBPoseHead', 'X3DHead'
+ 'TimeSformerHead', 'UniFormerHead', 'RGBPoseHead', 'X3DHead', 'FeatureHead'
]
diff --git a/mmaction/models/heads/base.py b/mmaction/models/heads/base.py
index b4d0d3fd28..c39da5aa9a 100644
--- a/mmaction/models/heads/base.py
+++ b/mmaction/models/heads/base.py
@@ -216,7 +216,8 @@ class score. Only called in test mode.
f'["score", "prob", None]')
batch_size = cls_scores.shape[0]
- cls_scores = cls_scores.view(batch_size // num_segs, num_segs, -1)
+ cls_scores = cls_scores.view((batch_size // num_segs, num_segs) +
+ cls_scores.shape[1:])
if self.average_clips is None:
return cls_scores
diff --git a/mmaction/models/heads/feature_head.py b/mmaction/models/heads/feature_head.py
new file mode 100644
index 0000000000..5714fddb33
--- /dev/null
+++ b/mmaction/models/heads/feature_head.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+
+from mmaction.registry import MODELS
+from .base import BaseHead
+
+
+@MODELS.register_module()
+class FeatureHead(BaseHead):
+ """General head for feature extraction.
+
+ Args:
+        spatial_type (str, optional): Pooling type in spatial dimension.
+            Defaults to 'avg'. If set to None, the spatial dimension is
+            kept, and for a GCN backbone the last two dimensions (T, V)
+            are kept.
+        temporal_type (str, optional): Pooling type in temporal dimension.
+            Defaults to 'avg'. If set to None, the temporal dimension is
+            kept, and for a GCN backbone dimension M is kept. Please note
+            that the channel order stays the same as the backbone output:
+            [N, T, C, H, W] for a 2D recognizer, and [N, M, C, T, V] for a
+            GCN recognizer.
+        backbone_name (str, optional): Backbone name used to specify
+            special operations. Currently supports: `'tsm'`, `'slowfast'`
+            and `'gcn'`. Defaults to None, which means the input is treated
+            as a normal feature.
+ num_segments (int, optional): Number of frame segments for TSM
+ backbone. Defaults to None.
+ kwargs (dict, optional): Any keyword argument to be used to initialize
+ the head.
+ """
+
+ def __init__(self,
+ spatial_type: str = 'avg',
+ temporal_type: str = 'avg',
+ backbone_name: Optional[str] = None,
+                 num_segments: Optional[int] = None,
+ **kwargs) -> None:
+ super().__init__(None, None, **kwargs)
+
+ self.temporal_type = temporal_type
+ self.backbone_name = backbone_name
+ self.num_segments = num_segments
+ if spatial_type == 'avg':
+ self.pool2d = torch.mean
+ elif spatial_type == 'max':
+ self.pool2d = torch.max
+ elif spatial_type is None:
+ self.pool2d = lambda x, dim: x
+ else:
+ raise NotImplementedError(
+ f'Unsupported spatial_type {spatial_type}')
+
+ if temporal_type == 'avg':
+ self.pool1d = torch.mean
+ elif temporal_type == 'max':
+ self.pool1d = torch.max
+ elif temporal_type is None:
+ self.pool1d = lambda x, dim: x
+ else:
+ raise NotImplementedError(
+ f'Unsupported temporal_type {temporal_type}')
+
+ def forward(self,
+ x: Tensor,
+ num_segs: Optional[int] = None,
+ **kwargs) -> Tensor:
+ """Defines the computation performed at every call.
+
+ Args:
+ x (Tensor): The input data.
+ num_segs (int): For 2D backbone. Number of segments into which
+ a video is divided. Defaults to None.
+ Returns:
+ Tensor: The output features after pooling.
+ """
+ if isinstance(x, Tensor):
+ n_dims = x.ndim
+ elif isinstance(x, tuple):
+ n_dims = x[0].ndim
+ assert self.backbone_name == 'slowfast', \
+ 'Only support SlowFast backbone to input tuple'
+ else:
+ raise NotImplementedError(f'Unsupported feature type: {type(x)}')
+ # For 2D backbone with spatial dimension
+ if n_dims == 4:
+ assert num_segs is not None
+ if self.backbone_name == 'tsm':
+ assert self.num_segments is not None, \
+                    'Please specify num_segments for TSM'
+ num_segs = self.num_segments
+ # [N, T, channels, H, W]
+ x = x.view((-1, num_segs) + x.shape[1:])
+ feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=1)
+
+ elif n_dims == 5:
+ if self.backbone_name == 'slowfast':
+ x_slow, x_fast = x
+ assert self.temporal_type is not None, \
+ 'slowfast backbone has to pool temporal dimension'
+ x_fast = self.pool1d(self.pool2d(x_fast, dim=[-2, -1]), dim=2)
+ x_slow = self.pool1d(self.pool2d(x_slow, dim=[-2, -1]), dim=2)
+ feat = torch.cat((x_slow, x_fast), dim=1)
+
+ # For GCN-based backbone
+ elif self.backbone_name == 'gcn':
+ # N, M, C, T, V
+ feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=1)
+ # For 3D backbone with spatial dimension
+ else:
+ # [N, channels, T, H, W]
+ feat = self.pool1d(self.pool2d(x, dim=[-2, -1]), dim=2)
+ # For backbone output feature without spatial and temporal dimension
+ elif n_dims == 2:
+ # [N, channels]
+ feat = x
+
+ return feat
+
+ def predict_by_feat(self, feats: Union[Tensor, Tuple[Tensor]],
+ data_samples) -> Tensor:
+ """Integrate multi-view features into one tensor.
+
+ Args:
+ feats (torch.Tensor | tuple[torch.Tensor]): Features from
+ upstream network.
+ data_samples (list[:obj:`ActionDataSample`]): The batch
+ data samples.
+
+ Returns:
+ Tensor: The integrated multi-view features.
+ """
+ num_segs = feats.shape[0] // len(data_samples)
+ feats = self.average_clip(feats, num_segs=num_segs)
+
+ return feats
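A small sketch of the new head pooling a plain 3D-backbone feature map; the tensor shape is an illustrative assumption:

import torch
from mmaction.models.heads import FeatureHead

head = FeatureHead(spatial_type='avg', temporal_type='avg')
feat = torch.rand(2, 2048, 8, 7, 7)  # [N, C, T, H, W] from a 3D backbone
pooled = head(feat)                  # -> torch.Size([2, 2048])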
diff --git a/mmaction/models/heads/slowfast_head.py b/mmaction/models/heads/slowfast_head.py
index 47ce3740ce..9478e2c652 100644
--- a/mmaction/models/heads/slowfast_head.py
+++ b/mmaction/models/heads/slowfast_head.py
@@ -1,4 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
import torch
from mmengine.model.weight_init import normal_init
from torch import Tensor, nn
@@ -53,22 +55,22 @@ def init_weights(self) -> None:
"""Initiate the parameters from scratch."""
normal_init(self.fc_cls, std=self.init_std)
- def forward(self, x: Tensor, **kwargs) -> None:
+ def forward(self, x: Tuple[Tensor], **kwargs) -> None:
"""Defines the computation performed at every call.
Args:
- x (Tensor): The input data.
+ x (tuple[torch.Tensor]): The input data.
Returns:
Tensor: The classification scores for input samples.
"""
- # ([N, channel_fast, T, H, W], [(N, channel_slow, T, H, W)])
- x_fast, x_slow = x
- # ([N, channel_fast, 1, 1, 1], [N, channel_slow, 1, 1, 1])
- x_fast = self.avg_pool(x_fast)
+        # ([N, channel_slow, T1, H, W], [N, channel_fast, T2, H, W])
+ x_slow, x_fast = x
+ # ([N, channel_slow, 1, 1, 1], [N, channel_fast, 1, 1, 1])
x_slow = self.avg_pool(x_slow)
+ x_fast = self.avg_pool(x_fast)
# [N, channel_fast + channel_slow, 1, 1, 1]
- x = torch.cat((x_slow, x_fast), dim=1)
+ x = torch.cat((x_fast, x_slow), dim=1)
if self.dropout is not None:
x = self.dropout(x)
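A toy illustration of the channel order used here (fast channels first in the concatenated tensor); the channel sizes are assumptions:

import torch

x_slow = torch.rand(2, 2048, 1, 1, 1)   # pooled slow pathway
x_fast = torch.rand(2, 256, 1, 1, 1)    # pooled fast pathway
x = torch.cat((x_fast, x_slow), dim=1)  # [2, 2304, 1, 1, 1]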
diff --git a/mmaction/models/heads/tpn_head.py b/mmaction/models/heads/tpn_head.py
index dfd9a1f699..6f32f65109 100644
--- a/mmaction/models/heads/tpn_head.py
+++ b/mmaction/models/heads/tpn_head.py
@@ -53,7 +53,8 @@ def forward(self,
x = self.avg_pool3d(x)
if self.new_cls is None:
self._init_new_cls()
- cls_score_feat_map = self.new_cls(x)
+ x = self.new_cls(x)
+ cls_score_feat_map = x.view(x.size(0), -1)
return cls_score_feat_map
if self.avg_pool2d is None:
diff --git a/mmaction/models/heads/tsm_head.py b/mmaction/models/heads/tsm_head.py
index 0f4c71ba53..3f53a11854 100644
--- a/mmaction/models/heads/tsm_head.py
+++ b/mmaction/models/heads/tsm_head.py
@@ -4,7 +4,7 @@
from torch import Tensor, nn
from mmaction.registry import MODELS
-from mmaction.utils import ConfigType
+from mmaction.utils import ConfigType, get_str_type
from .base import AvgConsensus, BaseHead
@@ -54,7 +54,7 @@ def __init__(self,
consensus_ = consensus.copy()
consensus_type = consensus_.pop('type')
- if consensus_type == 'AvgConsensus':
+ if get_str_type(consensus_type) == 'AvgConsensus':
self.consensus = AvgConsensus(**consensus_)
else:
self.consensus = None
diff --git a/mmaction/models/heads/tsn_head.py b/mmaction/models/heads/tsn_head.py
index 167929417e..a28fd7919d 100644
--- a/mmaction/models/heads/tsn_head.py
+++ b/mmaction/models/heads/tsn_head.py
@@ -3,7 +3,7 @@
from torch import Tensor, nn
from mmaction.registry import MODELS
-from mmaction.utils import ConfigType
+from mmaction.utils import ConfigType, get_str_type
from .base import AvgConsensus, BaseHead
@@ -43,7 +43,7 @@ def __init__(self,
consensus_ = consensus.copy()
consensus_type = consensus_.pop('type')
- if consensus_type == 'AvgConsensus':
+ if get_str_type(consensus_type) == 'AvgConsensus':
self.consensus = AvgConsensus(**consensus_)
else:
self.consensus = None
diff --git a/mmaction/models/heads/uniformer_head.py b/mmaction/models/heads/uniformer_head.py
index e83b552b93..3b9c661b81 100644
--- a/mmaction/models/heads/uniformer_head.py
+++ b/mmaction/models/heads/uniformer_head.py
@@ -7,7 +7,7 @@
from torch import Tensor, nn
from mmaction.registry import MODELS
-from mmaction.utils import ConfigType
+from mmaction.utils import ConfigType, get_str_type
from .base import BaseHead
@@ -66,7 +66,7 @@ def _select_channels(self, stact_dict):
def init_weights(self) -> None:
"""Initiate the parameters from scratch."""
- if self.init_cfg['type'] == 'Pretrained':
+ if get_str_type(self.init_cfg['type']) == 'Pretrained':
assert self.channel_map is not None, \
'load cls_head weights needs to specify the channel map file'
logger = MMLogger.get_current_instance()
diff --git a/mmaction/models/localizers/__init__.py b/mmaction/models/localizers/__init__.py
index eb775a2461..26e016410b 100644
--- a/mmaction/models/localizers/__init__.py
+++ b/mmaction/models/localizers/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .bmn import BMN
from .bsn import PEM, TEM
+from .tcanet import TCANet
-__all__ = ['TEM', 'PEM', 'BMN']
+__all__ = ['TEM', 'PEM', 'BMN', 'TCANet']
diff --git a/mmaction/models/localizers/bsn.py b/mmaction/models/localizers/bsn.py
index c35e3d9bca..a2b61a37cb 100644
--- a/mmaction/models/localizers/bsn.py
+++ b/mmaction/models/localizers/bsn.py
@@ -19,6 +19,7 @@ class TEM(BaseModel):
Code reference
https://github.com/wzmsltw/BSN-boundary-sensitive-network
Args:
+ temporal_dim (int): Total frames selected for each video.
tem_feat_dim (int): Feature dimension.
tem_hidden_dim (int): Hidden layer dimension.
tem_match_threshold (float): Temporal evaluation match threshold.
diff --git a/mmaction/models/localizers/tcanet.py b/mmaction/models/localizers/tcanet.py
new file mode 100644
index 0000000000..a1c20772f7
--- /dev/null
+++ b/mmaction/models/localizers/tcanet.py
@@ -0,0 +1,513 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
+from mmengine.model import BaseModel
+from torch import Tensor, nn
+
+from mmaction.registry import MODELS
+from mmaction.utils import OptConfigType
+from .utils import (batch_iou, bbox_se_transform_batch, bbox_se_transform_inv,
+ bbox_xw_transform_batch, bbox_xw_transform_inv,
+ post_processing)
+
+
+class LGTE(BaseModel):
+ """Local-Global Temporal Encoder (LGTE)
+
+ Args:
+ input_dim (int): Input feature dimension.
+        dropout (float): The dropout rate for the residual branches of
+            self-attention and FFN.
+        temporal_dim (int): Total frames selected for each video.
+            Defaults to 100.
+        window_size (int): The window size for the Local Temporal Encoder.
+            Defaults to 9.
+        num_heads (int): The number of attention heads. Defaults to 8.
+        init_cfg (dict or ConfigDict, optional): The config for
+            initialization. Defaults to None.
+ """
+
+ def __init__(self,
+ input_dim: int,
+ dropout: float,
+ temporal_dim: int = 100,
+ window_size: int = 9,
+ num_heads: int = 8,
+ init_cfg: OptConfigType = None,
+ **kwargs) -> None:
+ super(LGTE, self).__init__(init_cfg)
+
+ self.atten = MultiheadAttention(
+ embed_dims=input_dim,
+ num_heads=num_heads,
+ proj_drop=dropout,
+ attn_drop=0.1)
+ self.ffn = FFN(
+ embed_dims=input_dim, feedforward_channels=256, ffn_drop=dropout)
+
+ norm_cfg = dict(type='LN', eps=1e-6)
+ self.norm1 = build_norm_layer(norm_cfg, input_dim)[1]
+ self.norm2 = build_norm_layer(norm_cfg, input_dim)[1]
+
+ mask = self._mask_matrix(num_heads, temporal_dim, window_size)
+ self.register_buffer('mask', mask)
+
+ def forward(self, x: Tensor) -> Tensor:
+ """Forward call for LGTE.
+
+ Args:
+ x (torch.Tensor): The input tensor with shape (B, C, L)
+ """
+ x = x.permute(2, 0, 1)
+ mask = self.mask.repeat(x.size(1), 1, 1, 1)
+ L = x.shape[0]
+ x = self.atten(x, attn_mask=mask.reshape(-1, L, L))
+ x = self.norm1(x)
+ x = self.ffn(x)
+ x = self.norm2(x)
+ x = x.permute(1, 2, 0)
+ return x
+
+ @staticmethod
+ def _mask_matrix(num_heads: int, temporal_dim: int,
+ window_size: int) -> Tensor:
+ mask = torch.zeros(num_heads, temporal_dim, temporal_dim)
+ index = torch.arange(temporal_dim)
+
+ for i in range(num_heads // 2):
+ for j in range(temporal_dim):
+ ignored = (index - j).abs() > window_size / 2
+ mask[i, j] = ignored
+
+ return mask.unsqueeze(0).bool()
+
+
+def StartEndRegressor(sample_num: int, feat_dim: int) -> nn.Module:
+ """Start and End Regressor in the Temporal Boundary Regressor.
+
+ Args:
+ sample_num (int): number of samples for the start & end.
+ feat_dim (int): feature dimension.
+
+ Returns:
+ A pytorch module that works as the start and end regressor. The input
+ of the module should have a shape of (B, feat_dim * 2, sample_num).
+ """
+ hidden_dim = 128
+ regressor = nn.Sequential(
+ nn.Conv1d(
+ feat_dim * 2,
+ hidden_dim * 2,
+ kernel_size=3,
+ padding=1,
+ groups=8,
+ stride=2), nn.ReLU(inplace=True),
+ nn.Conv1d(
+ hidden_dim * 2,
+ hidden_dim * 2,
+ kernel_size=3,
+ padding=1,
+ groups=8,
+ stride=2), nn.ReLU(inplace=True),
+ nn.Conv1d(hidden_dim * 2, 2, kernel_size=sample_num // 4, groups=2),
+ nn.Flatten())
+ return regressor
+
+
+def CenterWidthRegressor(temporal_len: int, feat_dim: int) -> nn.Module:
+    """Center and Width Regressor in the Temporal Boundary Regressor.
+
+ Args:
+ temporal_len (int): temporal dimension of the inputs.
+ feat_dim (int): feature dimension.
+
+ Returns:
+        A pytorch module that works as the center and width regressor. The
+        input of the module should have shape (B, feat_dim, temporal_len).
+ """
+ hidden_dim = 512
+ regressor = nn.Sequential(
+ nn.Conv1d(
+ feat_dim, hidden_dim, kernel_size=3, padding=1, groups=4,
+ stride=2), nn.ReLU(inplace=True),
+ nn.Conv1d(
+ hidden_dim,
+ hidden_dim,
+ kernel_size=3,
+ padding=1,
+ groups=4,
+ stride=2), nn.ReLU(inplace=True),
+ nn.Conv1d(
+ hidden_dim, hidden_dim, kernel_size=temporal_len // 4, groups=4),
+ nn.ReLU(inplace=True), nn.Conv1d(hidden_dim, 3, kernel_size=1))
+ return regressor
+
+
+class TemporalTransform:
+ """Temporal Transform to sample temporal features."""
+
+ def __init__(self, prop_boundary_ratio: float, action_sample_num: int,
+ se_sample_num: int, temporal_interval: int):
+ super(TemporalTransform, self).__init__()
+ self.temporal_interval = temporal_interval
+ self.prop_boundary_ratio = prop_boundary_ratio
+ self.action_sample_num = action_sample_num
+ self.se_sample_num = se_sample_num
+
+ def __call__(self, segments: Tensor, features: Tensor) -> List[Tensor]:
+ s_len = segments[:, 1] - segments[:, 0]
+ starts_segments = [
+ segments[:, 0] - self.prop_boundary_ratio * s_len, segments[:, 0]
+ ]
+ starts_segments = torch.stack(starts_segments, dim=1)
+
+ ends_segments = [
+ segments[:, 1], segments[:, 1] + self.prop_boundary_ratio * s_len
+ ]
+ ends_segments = torch.stack(ends_segments, dim=1)
+
+ starts_feature = self._sample_one_temporal(starts_segments,
+ self.se_sample_num,
+ features)
+ ends_feature = self._sample_one_temporal(ends_segments,
+ self.se_sample_num, features)
+ actions_feature = self._sample_one_temporal(segments,
+ self.action_sample_num,
+ features)
+ return starts_feature, actions_feature, ends_feature
+
+ def _sample_one_temporal(self, segments: Tensor, out_len: int,
+ features: Tensor) -> Tensor:
+ segments = segments.clamp(0, 1) * 2 - 1
+ theta = segments.new_zeros((features.size(0), 2, 3))
+ theta[:, 1, 1] = 1.0
+ theta[:, 0, 0] = (segments[:, 1] - segments[:, 0]) / 2.0
+ theta[:, 0, 2] = (segments[:, 1] + segments[:, 0]) / 2.0
+
+ size = torch.Size((*features.shape[:2], 1, out_len))
+ grid = F.affine_grid(theta, size)
+ stn_feature = F.grid_sample(features.unsqueeze(2), grid)
+ stn_feature = stn_feature.view(*features.shape[:2], out_len)
+ return stn_feature
+
+
+class TBR(BaseModel):
+ """Temporal Boundary Regressor (TBR)"""
+
+ def __init__(self,
+ se_sample_num: int,
+ action_sample_num: int,
+ temporal_dim: int,
+ prop_boundary_ratio: float = 0.5,
+ init_cfg: OptConfigType = None,
+ **kwargs) -> None:
+ super(TBR, self).__init__(init_cfg)
+
+ hidden_dim = 512
+
+ self.reg1se = StartEndRegressor(se_sample_num, hidden_dim)
+ temporal_len = se_sample_num * 2 + action_sample_num
+ self.reg1xw = CenterWidthRegressor(temporal_len, hidden_dim)
+ self.ttn = TemporalTransform(prop_boundary_ratio, action_sample_num,
+ se_sample_num, temporal_dim)
+
+ def forward(self, proposals: Tensor, features: Tensor, gt_boxes: Tensor,
+ iou_thres: float, training: bool) -> tuple:
+ proposals1 = proposals[:, :2]
+ starts_feat1, actions_feat1, ends_feat1 = self.ttn(
+ proposals1, features)
+
+ reg1se = self.reg1se(torch.cat([starts_feat1, ends_feat1], dim=1))
+
+ features1xw = torch.cat([starts_feat1, actions_feat1, ends_feat1],
+ dim=2)
+ reg1xw = self.reg1xw(features1xw).squeeze(2)
+
+ preds_iou1 = reg1xw[:, 2].sigmoid()
+ reg1xw = reg1xw[:, :2]
+
+ if training:
+ proposals2xw = bbox_xw_transform_inv(proposals1, reg1xw, 0.1, 0.2)
+ proposals2se = bbox_se_transform_inv(proposals1, reg1se, 1.0)
+
+ iou1 = batch_iou(proposals1, gt_boxes)
+ targets1se = bbox_se_transform_batch(proposals1, gt_boxes)
+ targets1xw = bbox_xw_transform_batch(proposals1, gt_boxes)
+ rloss1se = self.regress_loss(reg1se, targets1se, iou1, iou_thres)
+ rloss1xw = self.regress_loss(reg1xw, targets1xw, iou1, iou_thres)
+ rloss1 = rloss1se + rloss1xw
+ iloss1 = self.iou_loss(preds_iou1, iou1, iou_thres=iou_thres)
+ else:
+ proposals2xw = bbox_xw_transform_inv(proposals1, reg1xw, 0.1, 0.2)
+ proposals2se = bbox_se_transform_inv(proposals1, reg1se, 0.2)
+ rloss1 = iloss1 = 0
+ proposals2 = (proposals2se + proposals2xw) / 2.0
+ proposals2 = torch.clamp(proposals2, min=0.)
+ return preds_iou1, proposals2, rloss1, iloss1
+
+ def regress_loss(self, regression, targets, iou_with_gt, iou_thres):
+ weight = (iou_with_gt >= iou_thres).float().unsqueeze(1)
+ reg_loss = F.smooth_l1_loss(regression, targets, reduction='none')
+ if weight.sum() > 0:
+ reg_loss = (weight * reg_loss).sum() / weight.sum()
+ else:
+ reg_loss = (weight * reg_loss).sum()
+ return reg_loss
+
+ def iou_loss(self, preds_iou, match_iou, iou_thres):
+ preds_iou = preds_iou.view(-1)
+ u_hmask = (match_iou > iou_thres).float()
+ u_mmask = ((match_iou <= iou_thres) & (match_iou > 0.3)).float()
+ u_lmask = (match_iou <= 0.3).float()
+
+ num_h, num_m, num_l = u_hmask.sum(), u_mmask.sum(), u_lmask.sum()
+
+ bs, device = u_hmask.size()[0], u_hmask.device
+
+ r_m = min(num_h / num_m, 1)
+ u_smmask = torch.rand(bs, device=device) * u_mmask
+ u_smmask = (u_smmask > (1. - r_m)).float()
+
+ r_l = min(num_h / num_l, 1)
+ u_slmask = torch.rand(bs, device=device) * u_lmask
+ u_slmask = (u_slmask > (1. - r_l)).float()
+
+ iou_weights = u_hmask + u_smmask + u_slmask
+ iou_loss = F.smooth_l1_loss(preds_iou, match_iou, reduction='none')
+ if iou_weights.sum() > 0:
+ iou_loss = (iou_loss * iou_weights).sum() / iou_weights.sum()
+ else:
+ iou_loss = (iou_loss * iou_weights).sum()
+ return iou_loss
+
+
+@MODELS.register_module()
+class TCANet(BaseModel):
+ """Temporal Context Aggregation Network.
+
+    Please refer to `Temporal Context Aggregation Network for Temporal Action
+    Proposal Refinement `_.
+ Code Reference:
+ https://github.com/qinzhi-0110/Temporal-Context-Aggregation-Network-Pytorch
+ """
+
+ def __init__(self,
+ feat_dim: int = 2304,
+ se_sample_num: int = 32,
+ action_sample_num: int = 64,
+ temporal_dim: int = 100,
+ window_size: int = 9,
+ lgte_num: int = 2,
+ soft_nms_alpha: float = 0.4,
+ soft_nms_low_threshold: float = 0.0,
+ soft_nms_high_threshold: float = 0.0,
+ post_process_top_k: int = 100,
+ feature_extraction_interval: int = 16,
+ init_cfg: OptConfigType = None,
+ **kwargs) -> None:
+ super(TCANet, self).__init__(init_cfg)
+
+ self.soft_nms_alpha = soft_nms_alpha
+ self.soft_nms_low_threshold = soft_nms_low_threshold
+ self.soft_nms_high_threshold = soft_nms_high_threshold
+ self.feature_extraction_interval = feature_extraction_interval
+ self.post_process_top_k = post_process_top_k
+
+ hidden_dim = 512
+ self.x_1d_b_f = nn.Sequential(
+ nn.Conv1d(
+ feat_dim, hidden_dim, kernel_size=3, padding=1, groups=4),
+ nn.ReLU(inplace=True),
+ nn.Conv1d(
+ hidden_dim, hidden_dim, kernel_size=3, padding=1, groups=4),
+ nn.ReLU(inplace=True),
+ )
+
+ for i in 1, 2, 3:
+ tbr = TBR(
+ se_sample_num=se_sample_num,
+ action_sample_num=action_sample_num,
+ temporal_dim=temporal_dim,
+ init_cfg=init_cfg,
+ **kwargs)
+ setattr(self, f'tbr{i}', tbr)
+
+ self.lgtes = nn.ModuleList([
+ LGTE(
+ input_dim=hidden_dim,
+ dropout=0.1,
+ temporal_dim=temporal_dim,
+ window_size=window_size,
+ init_cfg=init_cfg,
+ **kwargs) for i in range(lgte_num)
+ ])
+
+ def forward(self, inputs, data_samples, mode, **kwargs):
+ """The unified entry for a forward process in both training and test.
+
+ The method should accept three modes:
+
+ - ``tensor``: Forward the whole network and return tensor or tuple of
+ tensor without any post-processing, same as a common nn.Module.
+ - ``predict``: Forward and return the predictions, which are fully
+ processed to a list of :obj:`ActionDataSample`.
+ - ``loss``: Forward and return a dict of losses according to the given
+ inputs and data samples.
+
+        Note that this method doesn't handle back propagation or optimizer
+        updating, which are done in the :meth:`train_step`.
+
+ Args:
+ inputs (Tensor): The input tensor with shape
+ (N, C, ...) in general.
+ data_samples (List[:obj:`ActionDataSample`], optional): The
+                annotation data of every sample. Defaults to None.
+ mode (str): Return what kind of value. Defaults to ``tensor``.
+
+ Returns:
+ The return type depends on ``mode``.
+
+ - If ``mode="tensor"``, return a tensor or a tuple of tensor.
+ - If ``mode="predict"``, return a list of ``ActionDataSample``.
+ - If ``mode="loss"``, return a dict of tensor.
+ """
+        if not isinstance(inputs, Tensor):
+ inputs = torch.stack(inputs)
+ if mode == 'tensor':
+ return self._forward(inputs, **kwargs)
+ if mode == 'predict':
+ return self.predict(inputs, data_samples, **kwargs)
+ elif mode == 'loss':
+ return self.loss(inputs, data_samples, **kwargs)
+ else:
+ raise RuntimeError(f'Invalid mode "{mode}". '
+ 'Only supports loss, predict and tensor mode')
+
+ def _forward(self, x):
+ """Define the computation performed at every call.
+
+ Args:
+ x (torch.Tensor): The input data.
+ Returns:
+ torch.Tensor: The output of the module.
+ """
+ x = self.x_1d_b_f(x)
+ for layer in self.lgtes:
+ x = layer(x)
+ return x
+
+ def loss(self, batch_inputs, batch_data_samples, **kwargs):
+ features = self._forward(batch_inputs)
+ proposals_ = [
+ sample.proposals['proposals'] for sample in batch_data_samples
+ ]
+
+ batch_size = len(proposals_)
+ proposals_num = max([_.shape[0] for _ in proposals_])
+
+ proposals = torch.zeros((batch_size, proposals_num, 3),
+ device=features.device)
+ for i, proposal in enumerate(proposals_):
+ proposals[i, :proposal.shape[0]] = proposal
+
+ gt_boxes_ = [
+ sample.gt_instances['gt_bbox'] for sample in batch_data_samples
+ ]
+ gt_boxes = torch.zeros((batch_size, proposals_num, 2),
+ device=features.device)
+ for i, gt_box in enumerate(gt_boxes_):
+ L = gt_box.shape[0]
+ if L <= proposals_num:
+ gt_boxes[i, :L] = gt_box
+ else:
+ random_index = torch.randperm(L)[:proposals_num]
+ gt_boxes[i] = gt_box[random_index]
+
+ for i in range(batch_size):
+ proposals[i, :, 2] = i
+ proposals = proposals.view(batch_size * proposals_num, 3)
+ proposals_select = proposals[:, 0:2].sum(dim=1) > 0
+ proposals = proposals[proposals_select, :]
+
+ features = features[proposals[:, 2].long()]
+
+ gt_boxes = gt_boxes.view(batch_size * proposals_num, 2)
+ gt_boxes = gt_boxes[proposals_select, :]
+
+ _, proposals1, rloss1, iloss1 = self.tbr1(proposals, features,
+ gt_boxes, 0.5, True)
+ _, proposals2, rloss2, iloss2 = self.tbr2(proposals1, features,
+ gt_boxes, 0.6, True)
+ _, _, rloss3, iloss3 = self.tbr3(proposals2, features, gt_boxes, 0.7,
+ True)
+
+ loss_dict = dict(
+ rloss1=rloss1,
+ rloss2=rloss2,
+ rloss3=rloss3,
+ iloss1=iloss1,
+ iloss2=iloss2,
+ iloss3=iloss3)
+ return loss_dict
+
+ def predict(self, batch_inputs, batch_data_samples, **kwargs):
+ features = self._forward(batch_inputs)
+ proposals_ = [
+ sample.proposals['proposals'] for sample in batch_data_samples
+ ]
+
+ batch_size = len(proposals_)
+ proposals_num = max([_.shape[0] for _ in proposals_])
+
+ proposals = torch.zeros((batch_size, proposals_num, 3),
+ device=features.device)
+ for i, proposal in enumerate(proposals_):
+ proposals[i, :proposal.shape[0]] = proposal
+
+ scores = proposals[:, :, 2]
+ for i in range(batch_size):
+ proposals[i, :, 2] = i
+
+ proposals = proposals.view(batch_size * proposals_num, 3)
+ proposals_select = proposals[:, 0:2].sum(dim=1) > 0
+ proposals = proposals[proposals_select, :]
+ scores = scores.view(-1)[proposals_select]
+
+ features = features[proposals[:, 2].long()]
+
+ preds_iou1, proposals1 = self.tbr1(proposals, features, None, 0.5,
+ False)[:2]
+ preds_iou2, proposals2 = self.tbr2(proposals1, features, None, 0.6,
+ False)[:2]
+ preds_iou3, proposals3 = self.tbr3(proposals2, features, None, 0.7,
+ False)[:2]
+
+ all_proposals = []
+ # all_proposals = [proposals]
+ all_proposals += [
+ torch.cat([proposals1, (scores * preds_iou1).view(-1, 1)], dim=1)
+ ]
+ all_proposals += [
+ torch.cat([proposals2, (scores * preds_iou2).view(-1, 1)], dim=1)
+ ]
+ all_proposals += [
+ torch.cat([proposals3, (scores * preds_iou3).view(-1, 1)], dim=1)
+ ]
+
+ all_proposals = torch.cat(all_proposals, dim=0).cpu().numpy()
+ video_info = batch_data_samples[0].metainfo
+ proposal_list = post_processing(all_proposals, video_info,
+ self.soft_nms_alpha,
+ self.soft_nms_low_threshold,
+ self.soft_nms_high_threshold,
+ self.post_process_top_k,
+ self.feature_extraction_interval)
+ output = [
+ dict(
+ video_name=video_info['video_name'],
+ proposal_list=proposal_list)
+ ]
+ return output
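A condensed sketch of how the new localizer could be configured; the values simply mirror the constructor defaults above and are not taken from a released config:

model = dict(
    type='TCANet',
    feat_dim=2304,
    se_sample_num=32,
    action_sample_num=64,
    temporal_dim=100,
    window_size=9,
    lgte_num=2,
    soft_nms_alpha=0.4,
    soft_nms_low_threshold=0.0,
    soft_nms_high_threshold=0.0,
    post_process_top_k=100,
    feature_extraction_interval=16)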
diff --git a/mmaction/models/localizers/utils/__init__.py b/mmaction/models/localizers/utils/__init__.py
index 2263eb242f..bc7057a2bb 100644
--- a/mmaction/models/localizers/utils/__init__.py
+++ b/mmaction/models/localizers/utils/__init__.py
@@ -2,8 +2,13 @@
from .bsn_utils import generate_bsp_feature, generate_candidate_proposals
from .proposal_utils import (post_processing, soft_nms, temporal_iop,
temporal_iou)
+from .tcanet_utils import (batch_iou, bbox_se_transform_batch,
+ bbox_se_transform_inv, bbox_xw_transform_batch,
+ bbox_xw_transform_inv)
__all__ = [
- 'generate_bsp_feature', 'generate_candidate_proposals', 'soft_nms',
- 'temporal_iop', 'temporal_iou', 'post_processing'
+ 'batch_iou', 'bbox_se_transform_batch', 'bbox_se_transform_inv',
+ 'bbox_xw_transform_batch', 'bbox_xw_transform_inv', 'generate_bsp_feature',
+ 'generate_candidate_proposals', 'post_processing', 'soft_nms',
+ 'temporal_iop', 'temporal_iou'
]
diff --git a/mmaction/models/localizers/utils/tcanet_utils.py b/mmaction/models/localizers/utils/tcanet_utils.py
new file mode 100644
index 0000000000..33b35bcf89
--- /dev/null
+++ b/mmaction/models/localizers/utils/tcanet_utils.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copied from
+# 'https://github.com/qinzhi-0110/'
+# 'Temporal-Context-Aggregation-Network-Pytorch/'
+# 'blob/main/utils.py'
+# TODO: refactor
+import torch
+
+
+def batch_iou(proposals, gt_boxes):
+ len_proposals = proposals[:, 1] - proposals[:, 0]
+ int_xmin = torch.max(proposals[:, 0], gt_boxes[:, 0])
+ int_xmax = torch.min(proposals[:, 1], gt_boxes[:, 1])
+ inter_len = torch.clamp(int_xmax - int_xmin, min=0.)
+ union_len = len_proposals - inter_len + gt_boxes[:, 1] - gt_boxes[:, 0]
+ jaccard = inter_len / (union_len + 0.00001)
+ return jaccard
+
+
+def bbox_xw_transform_inv(boxes, deltas, dx_w, dw_w):
+ widths = boxes[:, 1] - boxes[:, 0]
+ ctr_x = boxes[:, 0] + 0.5 * widths
+
+ dx = deltas[:, 0] * dx_w
+ dw = deltas[:, 1] * dw_w
+
+ pred_ctr_x = dx * widths + ctr_x
+ pred_w = torch.exp(dw) * widths
+
+ pred_boxes = deltas.clone()
+ # x1
+ pred_boxes[:, 0] = pred_ctr_x - 0.5 * pred_w
+ # x2
+ pred_boxes[:, 1] = pred_ctr_x + 0.5 * pred_w
+
+ return pred_boxes
+
+
+def bbox_xw_transform_batch(ex_rois, gt_rois):
+ ex_widths = torch.clamp(ex_rois[:, 1] - ex_rois[:, 0], min=0.00001)
+ ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
+
+ gt_widths = torch.clamp(gt_rois[:, 1] - gt_rois[:, 0], min=0.00001)
+ gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
+
+ targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
+ targets_dw = torch.log(gt_widths / ex_widths)
+ targets = torch.stack((targets_dx, targets_dw), dim=1)
+ return targets
+
+
+def bbox_se_transform_batch(ex_rois, gt_rois):
+ ex_widths = torch.clamp(ex_rois[:, 1] - ex_rois[:, 0], min=0.00001)
+
+ s_offset = gt_rois[:, 0] - ex_rois[:, 0]
+ e_offset = gt_rois[:, 1] - ex_rois[:, 1]
+
+ targets_s = s_offset / ex_widths
+ targets_e = e_offset / ex_widths
+ targets = torch.stack((targets_s, targets_e), dim=1)
+ return targets
+
+
+def bbox_se_transform_inv(boxes, deltas, dse_w):
+ widths = boxes[:, 1] - boxes[:, 0]
+ s_offset = deltas[:, 0] * widths * dse_w
+ e_offset = deltas[:, 1] * widths * dse_w
+ pred_boxes = deltas.clone()
+ pred_boxes[:, 0] = boxes[:, 0] + s_offset
+ pred_boxes[:, 1] = boxes[:, 1] + e_offset
+ return pred_boxes
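The xw transforms form an encode/decode pair: `bbox_xw_transform_batch` produces (dx, dw) targets and `bbox_xw_transform_inv` maps predictions back to segments. A quick round-trip check (with unit weights the decoding is exact up to floating-point error):

import torch

from mmaction.models.localizers.utils import (bbox_xw_transform_batch,
                                               bbox_xw_transform_inv)

proposals = torch.tensor([[0.20, 0.60]])
gt_boxes = torch.tensor([[0.25, 0.55]])
deltas = bbox_xw_transform_batch(proposals, gt_boxes)
decoded = bbox_xw_transform_inv(proposals, deltas, 1.0, 1.0)
assert torch.allclose(decoded, gt_boxes)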
diff --git a/mmaction/models/recognizers/base.py b/mmaction/models/recognizers/base.py
index 8066ceed0d..7ce2a51b1f 100644
--- a/mmaction/models/recognizers/base.py
+++ b/mmaction/models/recognizers/base.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
import warnings
from abc import ABCMeta, abstractmethod
@@ -44,10 +45,21 @@ def __init__(self,
super(BaseRecognizer,
self).__init__(data_preprocessor=data_preprocessor)
+ def is_from(module, pkg_name):
+ # check whether the backbone is from pkg
+ model_type = module['type']
+ if isinstance(model_type, str):
+ return model_type.startswith(pkg_name)
+ elif inspect.isclass(model_type) or inspect.isfunction(model_type):
+ module_name = model_type.__module__
+ return pkg_name in module_name
+ else:
+ raise TypeError(
+ f'Unsupported type of module {type(module["type"])}')
+
# Record the source of the backbone.
self.backbone_from = 'mmaction2'
-
- if backbone['type'].startswith('mmcls.'):
+ if is_from(backbone, 'mmcls.'):
try:
# Register all mmcls models.
import mmcls.models # noqa: F401
@@ -55,30 +67,51 @@ def __init__(self,
raise ImportError('Please install mmcls to use this backbone.')
self.backbone = MODELS.build(backbone)
self.backbone_from = 'mmcls'
- elif backbone['type'].startswith('torchvision.'):
+ elif is_from(backbone, 'mmpretrain.'):
+ try:
+ # Register all mmpretrain models.
+ import mmpretrain.models # noqa: F401
+ except (ImportError, ModuleNotFoundError):
+ raise ImportError(
+ 'Please install mmpretrain to use this backbone.')
+ self.backbone = MODELS.build(backbone)
+ self.backbone_from = 'mmpretrain'
+ elif is_from(backbone, 'torchvision.'):
try:
import torchvision.models
except (ImportError, ModuleNotFoundError):
raise ImportError('Please install torchvision to use this '
'backbone.')
- backbone_type = backbone.pop('type')[12:]
- self.backbone = torchvision.models.__dict__[backbone_type](
- **backbone)
+ self.backbone_from = 'torchvision'
+ self.feature_shape = backbone.pop('feature_shape', None)
+ backbone_type = backbone.pop('type')
+ if isinstance(backbone_type, str):
+ backbone_type = backbone_type[12:]
+ self.backbone = torchvision.models.__dict__[backbone_type](
+ **backbone)
+ else:
+ self.backbone = backbone_type(**backbone)
# disable the classifier
self.backbone.classifier = nn.Identity()
self.backbone.fc = nn.Identity()
- self.backbone_from = 'torchvision'
- elif backbone['type'].startswith('timm.'):
+ elif is_from(backbone, 'timm.'):
+            # currently, only a `str` backbone type is supported
try:
import timm
except (ImportError, ModuleNotFoundError):
- raise ImportError('Please install timm to use this '
+ raise ImportError('Please install timm>=0.9.0 to use this '
'backbone.')
- backbone_type = backbone.pop('type')[5:]
+ self.backbone_from = 'timm'
+ self.feature_shape = backbone.pop('feature_shape', None)
# disable the classifier
backbone['num_classes'] = 0
- self.backbone = timm.create_model(backbone_type, **backbone)
- self.backbone_from = 'timm'
+ backbone_type = backbone.pop('type')
+ if isinstance(backbone_type, str):
+ backbone_type = backbone_type[5:]
+ self.backbone = timm.create_model(backbone_type, **backbone)
+ else:
+ raise TypeError(
+ f'Unsupported timm backbone type: {type(backbone_type)}')
else:
self.backbone = MODELS.build(backbone)
@@ -107,13 +140,19 @@ def with_cls_head(self) -> bool:
def init_weights(self) -> None:
"""Initialize the model network weights."""
- super().init_weights()
if self.backbone_from in ['torchvision', 'timm']:
warnings.warn('We do not initialize weights for backbones in '
f'{self.backbone_from}, since the weights for '
f'backbones in {self.backbone_from} are initialized '
'in their __init__ functions.')
+ def fake_init():
+ pass
+
+ # avoid repeated initialization
+ self.backbone.init_weights = fake_init
+ super().init_weights()
+
def loss(self, inputs: torch.Tensor, data_samples: SampleList,
**kwargs) -> dict:
"""Calculate losses from a batch of inputs and data samples.
@@ -204,7 +243,7 @@ def forward(self,
Args:
inputs (torch.Tensor): The input tensor with shape
(N, C, ...) in general.
- data_samples (List[``ActionDataSample`1], optional): The
+            data_samples (List[``ActionDataSample``], optional): The
annotation data of every samples. Defaults to None.
mode (str): Return what kind of value. Defaults to ``tensor``.
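With the `is_from` helper, the backbone `type` may be either a prefixed string or the class/constructor itself. A hypothetical config fragment (`feature_shape` is the new recognizer-side hint and is popped before the backbone is built):

import torchvision

backbone = dict(
    type=torchvision.models.resnet50,  # detected via its __module__ path
    feature_shape='NCHW')              # consumed by the recognizer, not torchvision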
diff --git a/mmaction/models/recognizers/recognizer2d.py b/mmaction/models/recognizers/recognizer2d.py
index 49ff448908..34c02378c6 100644
--- a/mmaction/models/recognizers/recognizer2d.py
+++ b/mmaction/models/recognizers/recognizer2d.py
@@ -1,6 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import torch
import torch.nn as nn
-from torch import Tensor
from mmaction.registry import MODELS
from mmaction.utils import SampleList
@@ -12,7 +12,7 @@ class Recognizer2D(BaseRecognizer):
"""2D recognizer model framework."""
def extract_feat(self,
- inputs: Tensor,
+ inputs: torch.Tensor,
stage: str = 'neck',
data_samples: SampleList = None,
test_mode: bool = False) -> tuple:
@@ -48,6 +48,42 @@ def extract_feat(self,
# 4) `num_clips` in `SampleFrames` or its subclass if `clip_len != 1`
inputs = inputs.view((-1, ) + inputs.shape[2:])
+ def forward_once(batch_imgs):
+ # Extract features through backbone.
+ if (hasattr(self.backbone, 'features')
+ and self.backbone_from == 'torchvision'):
+ x = self.backbone.features(batch_imgs)
+ elif self.backbone_from == 'timm':
+ x = self.backbone.forward_features(batch_imgs)
+ elif self.backbone_from in ['mmcls', 'mmpretrain']:
+ x = self.backbone(batch_imgs)
+ if isinstance(x, tuple):
+ assert len(x) == 1
+ x = x[0]
+ else:
+ x = self.backbone(batch_imgs)
+
+ if self.backbone_from in ['torchvision', 'timm']:
+ if not self.feature_shape:
+ # Transformer-based feature shape: B x L x C.
+ if len(x.shape) == 3:
+ self.feature_shape = 'NLC'
+ # Resnet-based feature shape: B x C x Hs x Ws.
+ elif len(x.shape) == 4:
+ self.feature_shape = 'NCHW'
+
+ if self.feature_shape == 'NHWC':
+ x = nn.AdaptiveAvgPool2d(1)(x.permute(0, 3, 1,
+ 2)) # B x C x 1 x 1
+ elif self.feature_shape == 'NCHW':
+ x = nn.AdaptiveAvgPool2d(1)(x) # B x C x 1 x 1
+ elif self.feature_shape == 'NLC':
+ x = nn.AdaptiveAvgPool1d(1)(x.transpose(1, 2)) # B x C x 1
+
+ x = x.reshape((x.shape[0], -1)) # B x C
+ x = x.reshape(x.shape + (1, 1)) # B x C x 1 x 1
+ return x
+
# Check settings of `fcn_test`.
fcn_test = False
if test_mode:
@@ -58,29 +94,52 @@ def extract_feat(self,
self.backbone.num_segments)
loss_predict_kwargs['fcn_test'] = fcn_test
- # Extract features through backbone.
- if (hasattr(self.backbone, 'features')
- and self.backbone_from == 'torchvision'):
- x = self.backbone.features(inputs)
- elif self.backbone_from == 'timm':
- x = self.backbone.forward_features(inputs)
- elif self.backbone_from == 'mmcls':
- x = self.backbone(inputs)
- if isinstance(x, tuple):
- assert len(x) == 1
- x = x[0]
+ # inference with batch size of `max_testing_views` if set
+ if self.test_cfg is not None and self.test_cfg.get(
+ 'max_testing_views', False):
+ max_testing_views = self.test_cfg.get('max_testing_views')
+ assert isinstance(max_testing_views, int)
+            # the backbone may specify num_segments (e.g. TSM)
+            num_segments = getattr(self.backbone, 'num_segments', None)
+            if num_segments is not None:
+                assert max_testing_views % num_segments == 0, \
+                    'make sure that max_testing_views is a multiple of ' \
+                    f'num_segments, but got {max_testing_views} and ' \
+                    f'{num_segments}'
+
+ total_views = inputs.shape[0]
+ view_ptr = 0
+ feats = []
+ while view_ptr < total_views:
+ batch_imgs = inputs[view_ptr:view_ptr + max_testing_views]
+ feat = forward_once(batch_imgs)
+ if self.with_neck:
+ feat, _ = self.neck(feat)
+ feats.append(feat)
+ view_ptr += max_testing_views
+
+ def recursively_cat(feats):
+ # recursively traverse feats until it's a tensor,
+ # then concat
+ out_feats = []
+ for e_idx, elem in enumerate(feats[0]):
+ batch_elem = [feat[e_idx] for feat in feats]
+ if not isinstance(elem, torch.Tensor):
+ batch_elem = recursively_cat(batch_elem)
+ else:
+ batch_elem = torch.cat(batch_elem)
+ out_feats.append(batch_elem)
+
+ return tuple(out_feats)
+
+ if isinstance(feats[0], tuple):
+ x = recursively_cat(feats)
+ else:
+ x = torch.cat(feats)
+ else:
+ x = forward_once(inputs)
else:
- x = self.backbone(inputs)
-
- if self.backbone_from in ['torchvision', 'timm']:
- # Transformer-based feature shape: B x L x C.
- if len(x.shape) == 3 and x.shape[2] > 1:
- x = nn.AdaptiveAvgPool1d(1)(x.transpose(1, 2)) # B x C x 1
- # Resnet-based feature shape: B x C x Hs x Ws。
- if len(x.shape) == 4 and (x.shape[2] > 1 or x.shape[3] > 1):
- x = nn.AdaptiveAvgPool2d(1)(x) # B x C x 1 x 1
- x = x.reshape((x.shape[0], -1)) # B x C
- x = x.reshape(x.shape + (1, 1)) # B x C x 1 x 1
+ x = forward_once(inputs)
# Return features extracted through backbone.
if stage == 'backbone':
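The 2D recognizer now mirrors the 3D one: if `test_cfg.max_testing_views` is set, test-time views go through `forward_once` in chunks and the (possibly nested) features are concatenated afterwards. An illustrative setting (the value is an assumption; for a TSM backbone it must be a multiple of `num_segments`):

test_cfg = dict(max_testing_views=16)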
diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py
index 81b86534ac..faecb8ea9c 100644
--- a/mmaction/models/recognizers/recognizer3d.py
+++ b/mmaction/models/recognizers/recognizer3d.py
@@ -69,9 +69,10 @@ def extract_feat(self,
feat, _ = self.neck(feat)
feats.append(feat)
view_ptr += max_testing_views
- # recursively traverse feats until it's a tensor, then concat
def recursively_cat(feats):
+ # recursively traverse feats until it's a tensor,
+ # then concat
out_feats = []
for e_idx, elem in enumerate(feats[0]):
batch_elem = [feat[e_idx] for feat in feats]
diff --git a/mmaction/models/recognizers/recognizer3d_mm.py b/mmaction/models/recognizers/recognizer3d_mm.py
index 1d7099b3c3..2b42269824 100644
--- a/mmaction/models/recognizers/recognizer3d_mm.py
+++ b/mmaction/models/recognizers/recognizer3d_mm.py
@@ -4,7 +4,7 @@
import torch
from mmaction.registry import MODELS
-from mmaction.utils.typing import OptSampleList
+from mmaction.utils import OptSampleList
from .base import BaseRecognizer
diff --git a/mmaction/models/similarity/__init__.py b/mmaction/models/similarity/__init__.py
new file mode 100644
index 0000000000..e69e50483e
--- /dev/null
+++ b/mmaction/models/similarity/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .adapters import SimpleMeanAdapter, TransformerAdapter
+from .clip_similarity import CLIPSimilarity
+
+__all__ = ['CLIPSimilarity', 'TransformerAdapter', 'SimpleMeanAdapter']
diff --git a/mmaction/models/similarity/adapters.py b/mmaction/models/similarity/adapters.py
new file mode 100644
index 0000000000..33d1e78665
--- /dev/null
+++ b/mmaction/models/similarity/adapters.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmaction.registry import MODELS
+
+
+class LayerNorm(nn.LayerNorm):
+ """Subclass torch's LayerNorm to handle fp16."""
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Defines the computation performed at every call."""
+ orig_type = x.dtype
+ ret = super().forward(x.type(torch.float32))
+ return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Perform quick gelu."""
+ return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+    """ResidualAttentionBlock.
+
+ Args:
+ d_model (int): The dimension of the model.
+ n_head (int): The number of heads.
+ attn_mask (torch.Tensor, optional): The attention mask.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ d_model: int,
+ n_head: int,
+ attn_mask: Optional[torch.Tensor] = None) -> None:
+ super().__init__()
+
+ self.attn = nn.MultiheadAttention(d_model, n_head)
+ self.ln_1 = LayerNorm(d_model)
+ self.mlp = nn.Sequential(
+ OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+ ('gelu', QuickGELU()),
+ ('c_proj', nn.Linear(d_model * 4, d_model))]))
+ self.ln_2 = LayerNorm(d_model)
+ self.attn_mask = attn_mask
+
+ def attention(self, x: torch.Tensor) -> torch.Tensor:
+ """Perform attention."""
+ self.attn_mask = self.attn_mask.to(
+ dtype=x.dtype,
+ device=x.device) if self.attn_mask is not None else None
+ return self.attn(
+ x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Defines the computation performed at every call."""
+ x = x + self.attention(self.ln_1(x))
+ x = x + self.mlp(self.ln_2(x))
+ return x
+
+
+class Transformer(nn.Module):
+    """Transformer.
+
+ Args:
+ width (int): The width of transformer.
+ heads (int): The number of heads of transformer.
+ layers (int): The number of layers of transformer.
+ attn_mask (torch.Tensor, optional): The attention mask.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ width: int,
+ layers: int,
+ heads: int,
+ attn_mask: Optional[torch.Tensor] = None):
+ super().__init__()
+ self.width = width
+ self.layers = layers
+ self.resblocks = nn.Sequential(*[
+ ResidualAttentionBlock(width, heads, attn_mask)
+ for _ in range(layers)
+ ])
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Defines the computation performed at every call."""
+ return self.resblocks(x)
+
+
+@MODELS.register_module()
+class TransformerAdapter(BaseModule):
+    """Transformer adapter, modified from github.com/openai/CLIP.
+
+ Args:
+ num_segs (int): The number of segments.
+ transformer_width (int): The width of transformer.
+ transformer_heads (int): The number of heads of transformer.
+ transformer_layers (int): The number of layers of transformer.
+ """
+
+ def __init__(self, num_segs: int, transformer_width: int,
+ transformer_heads: int, transformer_layers: int) -> None:
+ super(TransformerAdapter, self).__init__()
+ self.num_segs = num_segs
+
+ self.positional_embedding = nn.Parameter(
+ torch.empty(num_segs, transformer_width))
+ self.transformer = Transformer(
+ width=transformer_width,
+ layers=transformer_layers,
+ heads=transformer_heads)
+
+ def init_weights(self) -> None:
+ """Initialize the weights."""
+
+ nn.init.normal_(self.positional_embedding, std=0.01)
+
+ proj_std = (self.transformer.width**-0.5) * (
+ (2 * self.transformer.layers)**-0.5)
+ attn_std = self.transformer.width**-0.5
+ fc_std = (2 * self.transformer.width)**-0.5
+ for block in self.transformer.resblocks:
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Defines the computation performed at every call."""
+ b, seq_length, c = x.size()
+
+ x_original = x
+ x = x + self.positional_embedding
+ x = x.transpose(0, 1) # NLD -> LND
+ x = self.transformer(x)
+ x = x.transpose(0, 1) # LND -> NLD
+ x = x.type(x_original.dtype) + x_original
+ return x.mean(dim=1)
+
+
+@MODELS.register_module()
+class SimpleMeanAdapter(BaseModule):
+ """Average features adapter.
+
+ Args:
+ dim (int): The dimension to perform averaging. Defaults to 1.
+ """
+
+ def __init__(self, dim: Union[int, Tuple[int]] = 1) -> None:
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Defines the computation performed at every call."""
+ return x.mean(dim=self.dim)
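A short sketch of the transformer adapter aggregating per-frame CLIP features into a single video embedding; the sizes are illustrative:

import torch

from mmaction.models.similarity import TransformerAdapter

adapter = TransformerAdapter(
    num_segs=8, transformer_width=512, transformer_heads=8,
    transformer_layers=1)
adapter.init_weights()               # fills the empty positional embedding
frame_feats = torch.rand(4, 8, 512)  # [B, num_segs, C]
video_feat = adapter(frame_feats)    # -> torch.Size([4, 512])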
diff --git a/mmaction/models/similarity/clip_similarity.py b/mmaction/models/similarity/clip_similarity.py
new file mode 100644
index 0000000000..1dcb300300
--- /dev/null
+++ b/mmaction/models/similarity/clip_similarity.py
@@ -0,0 +1,175 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, Dict, List, Tuple
+
+import torch
+from mmengine.dist import all_gather, get_rank
+from mmengine.model import BaseModel
+from mmengine.structures import InstanceData
+
+from mmaction.registry import MODELS
+from mmaction.utils import ForwardResults, OptSampleList
+
+
+class GatherLayer(torch.autograd.Function):
+ """Gather tensors from all process, supporting backward propagation."""
+
+ @staticmethod
+ def forward(ctx: Any, input: torch.Tensor) -> Tuple[List]:
+ ctx.save_for_backward(input)
+ output = all_gather(input)
+ return tuple(output)
+
+ @staticmethod
+ def backward(ctx: Any, *grads: torch.Tensor) -> torch.Tensor:
+ input, = ctx.saved_tensors
+ grad_out = torch.zeros_like(input)
+ grad_out[:] = grads[get_rank()]
+ return grad_out
+
+
+@MODELS.register_module()
+class CLIPSimilarity(BaseModel):
+ """CLIP-based similarity model.
+
+ Args:
+ clip_arch (str): The architecture of the clip model.
+ Supported choices are `'ViT-B/32'`, `'ViT-B/16'`,
+ `'ViT-L/14'` and `'ViT-L/14@336px'`.
+ data_preprocessor (dict): The pre-process config.
+ adapter (dict): The 3D adapter config.
+ to_float32 (bool): Whether to convert the dtype of params of clip
+ model to float32.
+        frozen_layers (int): Layers to be frozen (all params fixed). -1 means
+ not freezing any parameters. Defaults to -1.
+ loss (dict): The config of loss. Defaults to
+ `dict(type='CrossEntropyLoss', loss_weight=0.5)`.
+ """
+
+ def __init__(
+ self,
+ clip_arch: str,
+ data_preprocessor: Dict[str, Dict],
+ adapter: Dict,
+ to_float32: bool = False,
+ frozen_layers: int = -1,
+ loss: Dict = dict(type='CrossEntropyLoss', loss_weight=0.5)
+ ) -> None:
+ super(CLIPSimilarity,
+ self).__init__(data_preprocessor=data_preprocessor)
+
+ try:
+ import clip
+ except ImportError:
+ raise ImportError('Please run `pip install '
+ 'git+https://github.com/openai/CLIP.git` '
+ 'to install clip first. ')
+
+ self.clip = clip.load(clip_arch, device='cpu')[0]
+ if to_float32:
+ self.clip.float()
+ self.loss = MODELS.build(loss)
+ self.adapter = MODELS.build(adapter)
+ self.frozen_layers = frozen_layers
+ self._freeze_stages()
+
+ def encode_video(self, video: torch.Tensor) -> torch.Tensor:
+ """Encode video."""
+ b, n, c, h, w = video.shape
+ video = video.view(-1, c, h, w)
+ frames_features = self.encode_image(video)
+ frames_features = frames_features.view(b, n, -1)
+ video_features = self.adapter(frames_features)
+ return video_features
+
+ def encode_image(self, image: torch.Tensor) -> torch.Tensor:
+ """Encode image."""
+ return self.clip.encode_image(image)
+
+ def encode_text(self, text: torch.Tensor) -> torch.Tensor:
+ """Encode text."""
+ return self.clip.encode_text(text)
+
+ def extract_feat(self,
+ inputs: Dict[str, torch.Tensor],
+ norm: bool = True) -> Tuple:
+ """Extract features."""
+ text_inputs = inputs['text']
+ video_inputs = inputs['imgs']
+ text_features = self.encode_text(text_inputs)
+ video_features = self.encode_video(video_inputs)
+
+ if norm:
+ text_features = text_features / text_features.norm(
+ dim=-1, keepdim=True)
+ video_features = video_features / video_features.norm(
+ dim=-1, keepdim=True)
+
+ return video_features, text_features
+
+ def forward(self,
+ inputs: Dict[str, torch.Tensor],
+ data_samples: OptSampleList = None,
+ mode: str = 'tensor') -> ForwardResults:
+ """Forward function."""
+
+ if mode == 'tensor':
+ return self.extract_feat(inputs, norm=False)
+
+ elif mode == 'loss':
+ video_features, text_features = self.extract_feat(inputs)
+ video_features = torch.cat(
+ GatherLayer.apply(video_features), dim=0)
+ text_features = torch.cat(GatherLayer.apply(text_features), dim=0)
+
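+            # Symmetric InfoNCE-style contrastive loss computed over the
+            # gathered (global) batch of video/text features.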
+ logit_scale = self.clip.logit_scale.exp()
+ logits_per_video = logit_scale * video_features @ text_features.t()
+ logits_per_text = logits_per_video.t()
+
+ labels = torch.arange(logits_per_video.shape[0]).to(
+ logit_scale.device)
+
+ sim_loss_v2t = self.loss(logits_per_video, labels)
+ sim_loss_t2v = self.loss(logits_per_text, labels)
+
+ losses = dict()
+ losses['sim_loss_v2t'] = sim_loss_v2t
+ losses['sim_loss_t2v'] = sim_loss_t2v
+ return losses
+
+ elif mode == 'predict':
+ video_features, text_features = self.extract_feat(inputs)
+ for ds, vf, tf in zip(data_samples, video_features, text_features):
+ features = InstanceData(video_feature=vf, text_feature=tf)
+ ds.features = features
+ return data_samples
+
+ else:
+ raise RuntimeError(f'Invalid mode "{mode}". '
+ 'Only supports loss, predict and tensor mode')
+
+ def train(self, mode: bool = True) -> None:
+ """Set the optimization status when training."""
+ super().train(mode)
+ self._freeze_stages()
+
+ def _freeze_stages(self) -> None:
+ """Prevent all the parameters from being optimized before
+ ``self.frozen_layers``."""
+
+ if self.frozen_layers >= 0:
+ top_layers = [
+ 'ln_final', 'text_projection', 'logit_scale', 'visual.ln_post',
+ 'visual.proj'
+ ]
+ mid_layers = [
+ 'visual.transformer.resblocks', 'transformer.resblocks'
+ ]
+
+ for name, param in self.clip.named_parameters():
+ if any(name.find(n) == 0 for n in top_layers):
+ continue
+ elif any(name.find(n) == 0 for n in mid_layers):
+ layer_n = int(name.split('.resblocks.')[1].split('.')[0])
+ if layer_n >= self.frozen_layers:
+ continue
+ param.requires_grad = False
diff --git a/mmaction/models/utils/graph.py b/mmaction/models/utils/graph.py
index 7575640fc8..e8a5624787 100644
--- a/mmaction/models/utils/graph.py
+++ b/mmaction/models/utils/graph.py
@@ -111,8 +111,10 @@ class Graph:
"""The Graph to model the skeletons.
Args:
- layout (str): must be one of the following candidates:
- 'openpose', 'nturgb+d', 'coco'. Defaults to ``'coco'``.
+ layout (str or dict): must be one of the following candidates:
+ 'openpose', 'nturgb+d', 'coco', or a dict with the following
+ keys: 'num_node', 'inward', and 'center'.
+ Defaults to ``'coco'``.
mode (str): must be one of the following candidates:
'stgcn_spatial', 'spatial'. Defaults to ``'spatial'``.
max_hop (int): the maximal distance between two connected
@@ -120,7 +122,7 @@ class Graph:
"""
def __init__(self,
- layout: str = 'coco',
+ layout: Union[str, dict] = 'coco',
mode: str = 'spatial',
max_hop: int = 1) -> None:
@@ -128,7 +130,12 @@ def __init__(self,
self.layout = layout
self.mode = mode
- assert layout in ['openpose', 'nturgb+d', 'coco']
+ if isinstance(layout, dict):
+ assert 'num_node' in layout
+ assert 'inward' in layout
+ assert 'center' in layout
+ else:
+ assert layout in ['openpose', 'nturgb+d', 'coco']
self.set_layout(layout)
self.hop_dis = get_hop_distance(self.num_node, self.inward, max_hop)
@@ -163,6 +170,10 @@ def set_layout(self, layout: str) -> None:
(12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0),
(6, 0), (1, 0), (3, 1), (2, 0), (4, 2)]
self.center = 0
+ elif isinstance(layout, dict):
+ self.num_node = layout['num_node']
+ self.inward = layout['inward']
+ self.center = layout['center']
else:
raise ValueError(f'Do Not Exist This Layout: {layout}')
self.self_link = [(i, i) for i in range(self.num_node)]
diff --git a/mmaction/structures/action_data_sample.py b/mmaction/structures/action_data_sample.py
index 196b080136..6ea146cba2 100644
--- a/mmaction/structures/action_data_sample.py
+++ b/mmaction/structures/action_data_sample.py
@@ -176,3 +176,18 @@ def gt_instances(self, value):
def gt_instances(self):
"""Deleter of `gt_instances`"""
del self._gt_instances
+
+ @property
+ def features(self):
+ """Setter of `features`"""
+ return self._features
+
+ @features.setter
+ def features(self, value):
+ """Setter of `features`"""
+ self.set_field(value, '_features', dtype=InstanceData)
+
+ @features.deleter
+ def features(self):
+ """Deleter of `features`"""
+ del self._features
diff --git a/mmaction/testing/__init__.py b/mmaction/testing/__init__.py
index 9f76126057..d5afdab7d5 100644
--- a/mmaction/testing/__init__.py
+++ b/mmaction/testing/__init__.py
@@ -1,13 +1,13 @@
# Copyright (c) OpenMMLab. All rights reserved.
from ._utils import (check_norm_state, generate_backbone_demo_inputs,
- generate_detector_demo_inputs,
- generate_recognizer_demo_inputs, get_audio_recognizer_cfg,
+ generate_detector_demo_inputs, get_audio_recognizer_cfg,
get_cfg, get_detector_cfg, get_localizer_cfg,
- get_recognizer_cfg, get_skeletongcn_cfg)
+ get_recognizer_cfg, get_similarity_cfg,
+ get_skeletongcn_cfg)
__all__ = [
- 'check_norm_state', 'generate_backbone_demo_inputs',
- 'generate_recognizer_demo_inputs', 'get_cfg', 'get_recognizer_cfg',
- 'get_audio_recognizer_cfg', 'get_localizer_cfg', 'get_detector_cfg',
- 'generate_detector_demo_inputs', 'get_skeletongcn_cfg'
+ 'check_norm_state', 'generate_backbone_demo_inputs', 'get_cfg',
+ 'get_recognizer_cfg', 'get_audio_recognizer_cfg', 'get_localizer_cfg',
+ 'get_detector_cfg', 'generate_detector_demo_inputs', 'get_skeletongcn_cfg',
+ 'get_similarity_cfg'
]
diff --git a/mmaction/testing/_utils.py b/mmaction/testing/_utils.py
index 5e6dbf649d..350335b7ea 100644
--- a/mmaction/testing/_utils.py
+++ b/mmaction/testing/_utils.py
@@ -111,7 +111,7 @@ def get_cfg(config_type, fname):
influencing other tests.
"""
config_types = ('recognition', 'recognition_audio', 'localization',
- 'detection', 'skeleton')
+ 'detection', 'skeleton', 'retrieval')
assert config_type in config_types
repo_dpath = osp.dirname(osp.dirname(osp.dirname(__file__)))
@@ -141,3 +141,7 @@ def get_detector_cfg(fname):
def get_skeletongcn_cfg(fname):
return get_cfg('skeleton', fname)
+
+
+def get_similarity_cfg(fname):
+ return get_cfg('retrieval', fname)
diff --git a/mmaction/utils/__init__.py b/mmaction/utils/__init__.py
index 02ac88b015..af91d382c4 100644
--- a/mmaction/utils/__init__.py
+++ b/mmaction/utils/__init__.py
@@ -1,11 +1,19 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .collect_env import collect_env
from .gradcam_utils import GradCAM
-from .misc import frame_extract, get_random_string, get_shm_dir, get_thread_id
+from .misc import (VideoWriter, frame_extract, get_random_string, get_shm_dir,
+ get_str_type, get_thread_id)
from .setup_env import register_all_modules
-from .typing import * # noqa: F401,F403
+from .typing_utils import * # noqa: F401,F403
__all__ = [
- 'collect_env', 'get_random_string', 'get_thread_id', 'get_shm_dir',
- 'frame_extract', 'GradCAM', 'register_all_modules'
+ 'collect_env',
+ 'get_random_string',
+ 'get_thread_id',
+ 'get_shm_dir',
+ 'frame_extract',
+ 'GradCAM',
+ 'register_all_modules',
+ 'VideoWriter',
+ 'get_str_type',
]
diff --git a/mmaction/utils/collect_env.py b/mmaction/utils/collect_env.py
index 37599b4bf8..506897684d 100644
--- a/mmaction/utils/collect_env.py
+++ b/mmaction/utils/collect_env.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
from mmengine.utils import get_git_hash
from mmengine.utils.dl_utils import collect_env as collect_basic_env
@@ -10,6 +11,20 @@ def collect_env():
env_info = collect_basic_env()
env_info['MMAction2'] = (
mmaction.__version__ + '+' + get_git_hash(digits=7))
+ env_info['MMCV'] = (mmcv.__version__)
+
+ try:
+ import mmdet
+ env_info['MMDetection'] = (mmdet.__version__)
+ except ImportError:
+ pass
+
+ try:
+ import mmpose
+ env_info['MMPose'] = (mmpose.__version__)
+ except ImportError:
+ pass
+
return env_info
diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py
index bf4358a2f4..b749eaa4b3 100644
--- a/mmaction/utils/misc.py
+++ b/mmaction/utils/misc.py
@@ -1,10 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
import ctypes
+import inspect
import os
import os.path as osp
import random
import string
-from typing import Optional
+from types import FunctionType, ModuleType
+from typing import Optional, Union
import cv2
import mmcv
@@ -73,3 +75,60 @@ def frame_extract(video_path: str,
flag, frame = vid.read()
return frame_paths, frames
+
+
+class VideoWriter():
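+    """Write frames to a video file with OpenCV.
+
+    A minimal usage sketch (``frames`` below is a hypothetical iterable of
+    BGR ``np.ndarray`` images):
+
+    Example:
+        >>> with VideoWriter('out.mp4', fps=30) as writer:
+        ...     for frame in frames:
+        ...         writer.write_frame(frame)
+
+    Args:
+        video_file (str): Path of the output video file; ``.mp4`` and
+            ``.avi`` suffixes are supported.
+        fps (int or float): Frame rate of the output video.
+    """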
+
+ def __init__(self, video_file, fps):
+ self.video_file = video_file
+ self.fps = fps
+        if video_file.endswith('.mp4'):
+            self.fourcc = 'mp4v'
+        elif video_file.endswith('.avi'):
+            self.fourcc = 'XVID'
+        else:
+            raise ValueError(f'Unsupported video suffix: {video_file}')
+
+ out_dir = osp.dirname(osp.abspath(self.video_file))
+ if not osp.exists(out_dir):
+ os.makedirs(out_dir, exist_ok=True)
+
+ def _init_cv2_writer(self, frame):
+ from cv2 import VideoWriter, VideoWriter_fourcc
+ height, width = frame.shape[:2]
+ resolution = (width, height)
+ self.writer = VideoWriter(self.video_file,
+ VideoWriter_fourcc(*self.fourcc), self.fps,
+ resolution)
+
+ def write_frame(self, frame):
+ if not getattr(self, 'writer', None):
+ self._init_cv2_writer(frame)
+ self.writer.write(frame)
+
+ def release(self):
+ self.writer.release()
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, trace):
+ self.release()
+
+
+def get_str_type(module: Union[str, ModuleType, FunctionType]) -> str:
+ """Return the string type name of module.
+
+ Args:
+ module (str | ModuleType | FunctionType):
+ The target module class
+
+    Returns:
+        str: Class name of the module, or ``None`` if ``module`` is not a
+            str, class or function.
+ """
+ if isinstance(module, str):
+ str_type = module
+ elif inspect.isclass(module) or inspect.isfunction(module):
+ str_type = module.__name__
+ else:
+ return None
+
+ return str_type
diff --git a/mmaction/utils/typing.py b/mmaction/utils/typing_utils.py
similarity index 100%
rename from mmaction/utils/typing.py
rename to mmaction/utils/typing_utils.py
diff --git a/mmaction/version.py b/mmaction/version.py
index 76d189b4d2..acae488d8a 100644
--- a/mmaction/version.py
+++ b/mmaction/version.py
@@ -1,6 +1,6 @@
# Copyright (c) Open-MMLab. All rights reserved.
-__version__ = '1.0.0'
+__version__ = '1.1.0'
def parse_version_info(version_str: str):
diff --git a/mmaction/visualization/action_visualizer.py b/mmaction/visualization/action_visualizer.py
index 6fc5ae2123..5924669c83 100644
--- a/mmaction/visualization/action_visualizer.py
+++ b/mmaction/visualization/action_visualizer.py
@@ -56,7 +56,7 @@ class ActionVisualizer(Visualizer):
>>> import torch
>>> import decord
>>> from pathlib import Path
- >>> from mmaction.core import ActionDataSample, ActionVisualizer
+ >>> from mmaction.structures import ActionDataSample, ActionVisualizer
>>> from mmengine.structures import LabelData
>>> # Example frame
>>> video = decord.VideoReader('./demo/demo.mp4')
diff --git a/model-index.yml b/model-index.yml
index 83f7862e57..d649fa2610 100644
--- a/model-index.yml
+++ b/model-index.yml
@@ -1,29 +1,36 @@
Import:
-- configs/recognition/csn/metafile.yml
-- configs/recognition/i3d/metafile.yml
-- configs/recognition/r2plus1d/metafile.yml
-- configs/recognition/slowfast/metafile.yml
-- configs/recognition/slowonly/metafile.yml
-- configs/recognition/timesformer/metafile.yml
-- configs/recognition/tin/metafile.yml
-- configs/recognition/tpn/metafile.yml
-- configs/recognition/tsm/metafile.yml
-- configs/recognition/tsn/metafile.yml
-- configs/recognition/c3d/metafile.yml
-- configs/recognition/tanet/metafile.yml
-- configs/recognition/x3d/metafile.yml
-- configs/recognition/trn/metafile.yml
-- configs/recognition/swin/metafile.yml
-- configs/recognition/c2d/metafile.yml
-- configs/recognition/omnisource/metafile.yml
-- configs/recognition/mvit/metafile.yml
-- configs/recognition/uniformer/metafile.yml
-- configs/recognition/uniformerv2/metafile.yml
-- configs/recognition/videomae/metafile.yml
-- configs/detection/slowfast/metafile.yml
-- configs/detection/slowonly/metafile.yml
-- configs/detection/acrn/metafile.yml
-- configs/skeleton/stgcn/metafile.yml
-- configs/skeleton/2s-agcn/metafile.yml
-- configs/skeleton/stgcnpp/metafile.yml
-- configs/skeleton/posec3d/metafile.yml
+ - configs/detection/acrn/metafile.yml
+ - configs/detection/lfb/metafile.yml
+ - configs/detection/slowfast/metafile.yml
+ - configs/detection/slowonly/metafile.yml
+ - configs/detection/videomae/metafile.yml
+ - configs/recognition/c2d/metafile.yml
+ - configs/recognition/c3d/metafile.yml
+ - configs/recognition/csn/metafile.yml
+ - configs/recognition/i3d/metafile.yml
+ - configs/recognition/mvit/metafile.yml
+ - configs/recognition/omnisource/metafile.yml
+ - configs/recognition/r2plus1d/metafile.yml
+ - configs/recognition/slowfast/metafile.yml
+ - configs/recognition/slowonly/metafile.yml
+ - configs/recognition/swin/metafile.yml
+ - configs/recognition/tanet/metafile.yml
+ - configs/recognition/timesformer/metafile.yml
+ - configs/recognition/tin/metafile.yml
+ - configs/recognition/tpn/metafile.yml
+ - configs/recognition/trn/metafile.yml
+ - configs/recognition/tsm/metafile.yml
+ - configs/recognition/tsn/metafile.yml
+ - configs/recognition/uniformer/metafile.yml
+ - configs/recognition/uniformerv2/metafile.yml
+ - configs/recognition/videomae/metafile.yml
+ - configs/recognition/videomaev2/metafile.yml
+ - configs/recognition/x3d/metafile.yml
+ - configs/recognition_audio/resnet/metafile.yml
+ - configs/localization/bmn/metafile.yml
+ - configs/localization/bsn/metafile.yml
+ - configs/retrieval/clip4clip/metafile.yml
+ - configs/skeleton/2s-agcn/metafile.yml
+ - configs/skeleton/posec3d/metafile.yml
+ - configs/skeleton/stgcn/metafile.yml
+ - configs/skeleton/stgcnpp/metafile.yml
diff --git a/projects/README.md b/projects/README.md
index 5117db6b1e..4b0195efd7 100644
--- a/projects/README.md
+++ b/projects/README.md
@@ -1,8 +1,8 @@
# Welcome to Projects of MMAction2
-In this folder, we welcome all contribution of deep-learning video understanding models from community.
+In this folder, we welcome all contributions of deep-learning video understanding models from the community.
-Here, these requirements, e.g., code standards, are not that strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMAction2. We appreciate all contributions from community to make MMAction2 greater.
+Here, these requirements, e.g., code standards, are not as strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMAction2. We appreciate all contributions from the community to make MMAction2 greater.
Here is an [example project](./example_project) about how to add your algorithms easily.
diff --git a/projects/actionclip/README.md b/projects/actionclip/README.md
new file mode 100644
index 0000000000..a16b44e249
--- /dev/null
+++ b/projects/actionclip/README.md
@@ -0,0 +1,157 @@
+# ActionCLIP Project
+
+[ActionCLIP: A New Paradigm for Video Action Recognition](https://arxiv.org/abs/2109.08472)
+
+
+
+## Abstract
+
+
+
+The canonical approach to video action recognition dictates a neural model to do a classic and standard 1-of-N majority vote task. They are trained to predict a fixed set of predefined categories, limiting their transferable ability on new datasets with unseen concepts. In this paper, we provide a new perspective on action recognition by attaching importance to the semantic information of label texts rather than simply mapping them into numbers. Specifically, we model this task as a video-text matching problem within a multimodal learning framework, which strengthens the video representation with more semantic language supervision and enables our model to do zero-shot action recognition without any further labeled data or parameters requirements. Moreover, to handle the deficiency of label texts and make use of tremendous web data, we propose a new paradigm based on this multimodal learning framework for action recognition, which we dub "pre-train, prompt and fine-tune". This paradigm first learns powerful representations from pre-training on a large amount of web image-text or video-text data. Then it makes the action recognition task to act more like pre-training problems via prompt engineering. Finally, it end-to-end fine-tunes on target datasets to obtain strong performance. We give an instantiation of the new paradigm, ActionCLIP, which not only has superior and flexible zero-shot/few-shot transfer ability but also reaches a top performance on general action recognition task, achieving 83.8% top-1 accuracy on Kinetics-400 with a ViT-B/16 as the backbone.
+
+
+
+
+
+
+
+## Usage
+
+### Setup Environment
+
+Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2. Run the following command to install `clip`.
+
+```shell
+pip install git+https://github.com/openai/CLIP.git
+```
+
+Assume that you are located at `$MMACTION2/projects/actionclip`.
+
+Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run it every time after you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md).
+
+Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link.
+
+```shell
+ln -s ../../data ./data
+```
+
+### Testing commands
+
+**To test with single GPU:**
+
+```bash
+mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT
+```
+
+**To test with multiple GPUs:**
+
+```bash
+mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8
+```
+
+**To test with multiple GPUs by slurm:**
+
+```bash
+mim test mmaction configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+## Results
+
+### Kinetics400
+
+| frame sampling strategy | backbone | top1 acc | top5 acc | testing protocol | config | ckpt |
+| :---------------------: | :------: | :------: | :------: | :----------------: | :------------------------------------------------------------------: | :-----------------------------------------------------------------: |
+| 1x1x8 | ViT-B/32 | 77.6 | 93.8 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth)\[1\] |
+| 1x1x8 | ViT-B/16 | 80.3 | 95.2 | 8 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb/vit-b-16-8f.pth)\[1\] |
+| 1x1x16 | ViT-B/16 | 81.1 | 95.6 | 16 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb/vit-b-16-16f.pth)\[1\] |
+| 1x1x32 | ViT-B/16 | 81.3 | 95.8 | 32 clips x 1 crop | [config](./configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb/vit-b-16-32f.pth)\[1\] |
+
+\[1\] The models are ported from the repo [ActionCLIP](https://github.com/sallymmx/ActionCLIP) and tested on our data. Currently, we only support the testing of ActionCLIP models. Due to the variation in testing data, our reported test accuracy differs from that of the original repository (on average, it is lower by one point). Please refer to this [issue](https://github.com/sallymmx/ActionCLIP/issues/14) for more details.
+
+## Zero-Shot Prediction
+
+We offer two methods for zero-shot prediction as follows. The `test.mp4` can be downloaded from [here](https://github-production-user-asset-6210df.s3.amazonaws.com/58767402/237333525-89ebee9a-573e-4e27-9047-0ad6422fa82f.mp4).
+
+### Using Naive Pytorch
+
+```python
+import torch
+import clip
+from models.load import init_actionclip
+from mmaction.utils import register_all_modules
+
+register_all_modules(True)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = init_actionclip('ViT-B/32-8', device=device)
+
+video_anno = dict(filename='test.mp4', start_index=0)
+video = preprocess(video_anno).unsqueeze(0).to(device)
+
+template = 'The woman is {}'
+labels = ['singing', 'dancing', 'performing']
+text = clip.tokenize([template.format(label) for label in labels]).to(device)
+
+with torch.no_grad():
+ video_features = model.encode_video(video)
+ text_features = model.encode_text(text)
+
+video_features /= video_features.norm(dim=-1, keepdim=True)
+text_features /= text_features.norm(dim=-1, keepdim=True)
+similarity = (100 * video_features @ text_features.T).softmax(dim=-1)
+probs = similarity.cpu().numpy()
+
+print("Label probs:", probs) # [[9.995e-01 5.364e-07 6.666e-04]]
+```
+
+### Using MMAction2 APIs
+
+```python
+import torch
+import mmengine
+from mmaction.utils import register_all_modules
+from mmaction.apis import inference_recognizer, init_recognizer
+
+register_all_modules(True)
+
+config_path = 'configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py'
+checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth'
+template = 'The woman is {}'
+labels = ['singing', 'dancing', 'performing']
+
+# Update the labels, the default is the label list of K400.
+config = mmengine.Config.fromfile(config_path)
+config.model.labels_or_label_file = labels
+config.model.template = template
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = init_recognizer(config=config, checkpoint=checkpoint_path, device=device)
+
+pred_result = inference_recognizer(model, 'test.mp4')
+probs = pred_result.pred_scores.item.cpu().numpy()
+print("Label probs:", probs) # [9.995e-01 5.364e-07 6.666e-04]
+```
+
+## Citation
+
+
+
+```bibtex
+@article{wang2021actionclip,
+ title={Actionclip: A new paradigm for video action recognition},
+ author={Wang, Mengmeng and Xing, Jiazheng and Liu, Yong},
+ journal={arXiv preprint arXiv:2109.08472},
+ year={2021}
+}
+```
diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py
new file mode 100644
index 0000000000..5150f082cc
--- /dev/null
+++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb.py
@@ -0,0 +1,52 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+num_segs = 16
+
+model = dict(
+ type='ActionClip',
+ clip_arch='ViT-B/16',
+ num_adapter_segs=num_segs,
+ num_adapter_layers=6,
+ labels_or_label_file='configs/label_map_k400.txt',
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[122.771, 116.746, 104.093],
+ std=[68.500, 66.632, 70.323],
+ format_shape='NCHW'))
+
+dataset_type = 'VideoDataset'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=num_segs,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_cfg = dict(type='TestLoop')
+test_evaluator = dict(type='AccMetric')
diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py
new file mode 100644
index 0000000000..08d3cf0747
--- /dev/null
+++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb.py
@@ -0,0 +1,52 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+num_segs = 32
+
+model = dict(
+ type='ActionClip',
+ clip_arch='ViT-B/16',
+ num_adapter_segs=num_segs,
+ num_adapter_layers=6,
+ labels_or_label_file='configs/label_map_k400.txt',
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[122.771, 116.746, 104.093],
+ std=[68.500, 66.632, 70.323],
+ format_shape='NCHW'))
+
+dataset_type = 'VideoDataset'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=num_segs,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_cfg = dict(type='TestLoop')
+test_evaluator = dict(type='AccMetric')
diff --git a/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py
new file mode 100644
index 0000000000..1ae1ca67db
--- /dev/null
+++ b/projects/actionclip/configs/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb.py
@@ -0,0 +1,52 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+num_segs = 8
+
+model = dict(
+ type='ActionClip',
+ clip_arch='ViT-B/16',
+ num_adapter_segs=num_segs,
+ num_adapter_layers=6,
+ labels_or_label_file='configs/label_map_k400.txt',
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[122.771, 116.746, 104.093],
+ std=[68.500, 66.632, 70.323],
+ format_shape='NCHW'))
+
+dataset_type = 'VideoDataset'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=num_segs,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_cfg = dict(type='TestLoop')
+test_evaluator = dict(type='AccMetric')
diff --git a/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py
new file mode 100644
index 0000000000..f39b45276b
--- /dev/null
+++ b/projects/actionclip/configs/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb.py
@@ -0,0 +1,52 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+num_segs = 8
+
+model = dict(
+ type='ActionClip',
+ clip_arch='ViT-B/32',
+ num_adapter_segs=num_segs,
+ num_adapter_layers=6,
+ labels_or_label_file='configs/label_map_k400.txt',
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[122.771, 116.746, 104.093],
+ std=[68.500, 66.632, 70.323],
+ format_shape='NCHW'))
+
+dataset_type = 'VideoDataset'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=num_segs,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_cfg = dict(type='TestLoop')
+test_evaluator = dict(type='AccMetric')
diff --git a/projects/actionclip/configs/label_map_k400.txt b/projects/actionclip/configs/label_map_k400.txt
new file mode 100644
index 0000000000..cdaafcb141
--- /dev/null
+++ b/projects/actionclip/configs/label_map_k400.txt
@@ -0,0 +1,400 @@
+abseiling
+air drumming
+answering questions
+applauding
+applying cream
+archery
+arm wrestling
+arranging flowers
+assembling computer
+auctioning
+baby waking up
+baking cookies
+balloon blowing
+bandaging
+barbequing
+bartending
+beatboxing
+bee keeping
+belly dancing
+bench pressing
+bending back
+bending metal
+biking through snow
+blasting sand
+blowing glass
+blowing leaves
+blowing nose
+blowing out candles
+bobsledding
+bookbinding
+bouncing on trampoline
+bowling
+braiding hair
+breading or breadcrumbing
+breakdancing
+brush painting
+brushing hair
+brushing teeth
+building cabinet
+building shed
+bungee jumping
+busking
+canoeing or kayaking
+capoeira
+carrying baby
+cartwheeling
+carving pumpkin
+catching fish
+catching or throwing baseball
+catching or throwing frisbee
+catching or throwing softball
+celebrating
+changing oil
+changing wheel
+checking tires
+cheerleading
+chopping wood
+clapping
+clay pottery making
+clean and jerk
+cleaning floor
+cleaning gutters
+cleaning pool
+cleaning shoes
+cleaning toilet
+cleaning windows
+climbing a rope
+climbing ladder
+climbing tree
+contact juggling
+cooking chicken
+cooking egg
+cooking on campfire
+cooking sausages
+counting money
+country line dancing
+cracking neck
+crawling baby
+crossing river
+crying
+curling hair
+cutting nails
+cutting pineapple
+cutting watermelon
+dancing ballet
+dancing charleston
+dancing gangnam style
+dancing macarena
+deadlifting
+decorating the christmas tree
+digging
+dining
+disc golfing
+diving cliff
+dodgeball
+doing aerobics
+doing laundry
+doing nails
+drawing
+dribbling basketball
+drinking
+drinking beer
+drinking shots
+driving car
+driving tractor
+drop kicking
+drumming fingers
+dunking basketball
+dying hair
+eating burger
+eating cake
+eating carrots
+eating chips
+eating doughnuts
+eating hotdog
+eating ice cream
+eating spaghetti
+eating watermelon
+egg hunting
+exercising arm
+exercising with an exercise ball
+extinguishing fire
+faceplanting
+feeding birds
+feeding fish
+feeding goats
+filling eyebrows
+finger snapping
+fixing hair
+flipping pancake
+flying kite
+folding clothes
+folding napkins
+folding paper
+front raises
+frying vegetables
+garbage collecting
+gargling
+getting a haircut
+getting a tattoo
+giving or receiving award
+golf chipping
+golf driving
+golf putting
+grinding meat
+grooming dog
+grooming horse
+gymnastics tumbling
+hammer throw
+headbanging
+headbutting
+high jump
+high kick
+hitting baseball
+hockey stop
+holding snake
+hopscotch
+hoverboarding
+hugging
+hula hooping
+hurdling
+hurling (sport)
+ice climbing
+ice fishing
+ice skating
+ironing
+javelin throw
+jetskiing
+jogging
+juggling balls
+juggling fire
+juggling soccer ball
+jumping into pool
+jumpstyle dancing
+kicking field goal
+kicking soccer ball
+kissing
+kitesurfing
+knitting
+krumping
+laughing
+laying bricks
+long jump
+lunge
+making a cake
+making a sandwich
+making bed
+making jewelry
+making pizza
+making snowman
+making sushi
+making tea
+marching
+massaging back
+massaging feet
+massaging legs
+massaging person's head
+milking cow
+mopping floor
+motorcycling
+moving furniture
+mowing lawn
+news anchoring
+opening bottle
+opening present
+paragliding
+parasailing
+parkour
+passing American football (in game)
+passing American football (not in game)
+peeling apples
+peeling potatoes
+petting animal (not cat)
+petting cat
+picking fruit
+planting trees
+plastering
+playing accordion
+playing badminton
+playing bagpipes
+playing basketball
+playing bass guitar
+playing cards
+playing cello
+playing chess
+playing clarinet
+playing controller
+playing cricket
+playing cymbals
+playing didgeridoo
+playing drums
+playing flute
+playing guitar
+playing harmonica
+playing harp
+playing ice hockey
+playing keyboard
+playing kickball
+playing monopoly
+playing organ
+playing paintball
+playing piano
+playing poker
+playing recorder
+playing saxophone
+playing squash or racquetball
+playing tennis
+playing trombone
+playing trumpet
+playing ukulele
+playing violin
+playing volleyball
+playing xylophone
+pole vault
+presenting weather forecast
+pull ups
+pumping fist
+pumping gas
+punching bag
+punching person (boxing)
+push up
+pushing car
+pushing cart
+pushing wheelchair
+reading book
+reading newspaper
+recording music
+riding a bike
+riding camel
+riding elephant
+riding mechanical bull
+riding mountain bike
+riding mule
+riding or walking with horse
+riding scooter
+riding unicycle
+ripping paper
+robot dancing
+rock climbing
+rock scissors paper
+roller skating
+running on treadmill
+sailing
+salsa dancing
+sanding floor
+scrambling eggs
+scuba diving
+setting table
+shaking hands
+shaking head
+sharpening knives
+sharpening pencil
+shaving head
+shaving legs
+shearing sheep
+shining shoes
+shooting basketball
+shooting goal (soccer)
+shot put
+shoveling snow
+shredding paper
+shuffling cards
+side kick
+sign language interpreting
+singing
+situp
+skateboarding
+ski jumping
+skiing (not slalom or crosscountry)
+skiing crosscountry
+skiing slalom
+skipping rope
+skydiving
+slacklining
+slapping
+sled dog racing
+smoking
+smoking hookah
+snatch weight lifting
+sneezing
+sniffing
+snorkeling
+snowboarding
+snowkiting
+snowmobiling
+somersaulting
+spinning poi
+spray painting
+spraying
+springboard diving
+squat
+sticking tongue out
+stomping grapes
+stretching arm
+stretching leg
+strumming guitar
+surfing crowd
+surfing water
+sweeping floor
+swimming backstroke
+swimming breast stroke
+swimming butterfly stroke
+swing dancing
+swinging legs
+swinging on something
+sword fighting
+tai chi
+taking a shower
+tango dancing
+tap dancing
+tapping guitar
+tapping pen
+tasting beer
+tasting food
+testifying
+texting
+throwing axe
+throwing ball
+throwing discus
+tickling
+tobogganing
+tossing coin
+tossing salad
+training dog
+trapezing
+trimming or shaving beard
+trimming trees
+triple jump
+tying bow tie
+tying knot (not on a tie)
+tying tie
+unboxing
+unloading truck
+using computer
+using remote controller (not gaming)
+using segway
+vault
+waiting in line
+walking the dog
+washing dishes
+washing feet
+washing hair
+washing hands
+water skiing
+water sliding
+watering plants
+waxing back
+waxing chest
+waxing eyebrows
+waxing legs
+weaving basket
+welding
+whistling
+windsurfing
+wrapping present
+wrestling
+writing
+yawning
+yoga
+zumba
diff --git a/projects/actionclip/models/__init__.py b/projects/actionclip/models/__init__.py
new file mode 100644
index 0000000000..7877a55c5d
--- /dev/null
+++ b/projects/actionclip/models/__init__.py
@@ -0,0 +1,4 @@
+from .actionclip import ActionClip
+from .load import init_actionclip
+
+__all__ = ['ActionClip', 'init_actionclip']
diff --git a/projects/actionclip/models/actionclip.py b/projects/actionclip/models/actionclip.py
new file mode 100644
index 0000000000..923b78c68f
--- /dev/null
+++ b/projects/actionclip/models/actionclip.py
@@ -0,0 +1,117 @@
+from typing import Dict, List, Optional, Union
+
+import clip
+import mmengine
+import torch
+import torch.nn.functional as F
+from mmengine.model import BaseModel
+from mmengine.structures import LabelData
+
+from mmaction.registry import MODELS
+from .adapter import TransformerAdapter
+
+
+def text_prompt(labels_or_label_file, template=None):
+ if isinstance(labels_or_label_file, str):
+ labels = mmengine.list_from_file(labels_or_label_file)
+ elif isinstance(labels_or_label_file, list):
+ labels = labels_or_label_file
+ else:
+ raise ValueError(f'`labels_or_label_file` must be `list` or `str`, '
+ f'but got {type(labels_or_label_file)}')
+
+ if template is None:
+ template = [
+ 'a photo of action {}', 'a picture of action {}',
+ 'Human action of {}', '{}, an action', '{} this is an action',
+ '{}, a video of action', 'Playing action of {}', '{}',
+ 'Playing a kind of action, {}', 'Doing a kind of action, {}',
+ 'Look, the human is {}', 'Can you recognize the action of {}?',
+ 'Video classification of {}', 'A video of {}', 'The man is {}',
+ 'The woman is {}'
+ ]
+ elif isinstance(template, str):
+ template = [template]
+ elif not mmengine.is_seq_of(template, str):
+ raise ValueError(f'`template` must be list of `str`, `str` or `None`, '
+ f'but got {type(template)}')
+
+ num_prompt = len(template)
+ prompt = torch.cat(
+ [clip.tokenize(t.format(c)) for t in template for c in labels])
+ return prompt, num_prompt
+
+
+@MODELS.register_module()
+class ActionClip(BaseModel):
+
+ def __init__(self,
+ clip_arch: str,
+ num_adapter_segs: int,
+ num_adapter_layers: int = 6,
+ labels_or_label_file: Optional[Union[List[str], str]] = None,
+ template: Optional[Union[List[str], str]] = None,
+ data_preprocessor: Optional[Dict] = None):
+ super(ActionClip, self).__init__(data_preprocessor=data_preprocessor)
+ self.clip = clip.load(clip_arch)[0]
+ self.adapter = TransformerAdapter(self.clip, num_adapter_segs,
+ num_adapter_layers)
+
+ if labels_or_label_file is not None:
+ self.prompt, self.num_prompt = text_prompt(labels_or_label_file,
+ template)
+ self.text_features = None
+
+ def encode_video(self, video):
+ b, n, c, h, w = video.shape
+ video = video.view(-1, c, h, w)
+ frames_features = self.encode_image(video)
+ frames_features = frames_features.view(b, n, -1)
+ video_features = self.adapter(frames_features)
+ return video_features
+
+ def encode_image(self, image):
+ return self.clip.encode_image(image)
+
+ def encode_text(self, text):
+ return self.clip.encode_text(text)
+
+ def forward(self,
+ inputs: torch.Tensor,
+ data_samples: Optional[List] = None,
+ mode: str = 'tensor'):
+
+ if mode == 'tensor':
+ return self.encode_video(inputs)
+
+ elif mode == 'predict':
+ assert hasattr(self, 'prompt'),\
+ '`labels_or_label_file` is required to perform prediction. '
+
+ video_features = self.encode_video(inputs)
+ video_features = video_features / video_features.norm(
+ dim=-1, keepdim=True)
+
+ bsz = len(data_samples)
+ num_views = video_features.shape[0] // bsz
+
+ if self.text_features is None:
+ text_features = self.encode_text(self.prompt.to(inputs.device))
+ self.text_features = text_features / text_features.norm(
+ dim=-1, keepdim=True)
+
+ # (bsz*num_views, num_prompt, num_classes) ->
+ # (bsz, num_views*num_prompt, num_classes)
+ similarity = (100.0 * video_features @ self.text_features.T). \
+ view(bsz, num_views * self.num_prompt, -1)
+
+ cls_scores = F.softmax(similarity, dim=2).mean(dim=1)
+
+ for data_sample, score in zip(data_samples, cls_scores):
+ data_sample.pred_scores = LabelData(item=score)
+
+ return data_samples
+
+ else:
+ raise RuntimeError(f'Invalid mode "{mode}". '
+ 'Only supports `predict` and `tensor` mode. ')
diff --git a/projects/actionclip/models/adapter.py b/projects/actionclip/models/adapter.py
new file mode 100644
index 0000000000..59cd8b4f86
--- /dev/null
+++ b/projects/actionclip/models/adapter.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from clip.model import Transformer
+from mmengine.model import BaseModule
+
+
+class TransformerAdapter(BaseModule):
+
+ def __init__(self,
+ clip_model: nn.Module,
+ num_segs: int,
+ num_layers: int = 6):
+ super(TransformerAdapter, self).__init__()
+ self.num_segs = num_segs
+
+ embed_dim = clip_model.text_projection.shape[1]
+ transformer_width = clip_model.ln_final.weight.shape[0]
+ transformer_heads = transformer_width // 64
+
+ self.frame_position_embeddings = nn.Embedding(self.num_segs, embed_dim)
+ self.transformer = Transformer(
+ width=embed_dim, layers=num_layers, heads=transformer_heads)
+
+ def init_weights(self):
+ for module in self.modules():
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ module.weight.data.normal_(mean=0.0, std=0.02)
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def forward(self, x: torch.Tensor):
+ b, seq_length, c = x.size()
+
+ x_original = x
+ position_ids = torch.arange(
+ seq_length, dtype=torch.long, device=x.device)
+ embeddings = self.frame_position_embeddings(position_ids)
+ x = x + embeddings.unsqueeze(0)
+ x = x.transpose(0, 1) # NLD -> LND
+ x = self.transformer(x)
+ x = x.transpose(0, 1) # LND -> NLD
+ x = x.type(x_original.dtype) + x_original
+ return x.mean(dim=1)
diff --git a/projects/actionclip/models/load.py b/projects/actionclip/models/load.py
new file mode 100644
index 0000000000..9e0183cf7d
--- /dev/null
+++ b/projects/actionclip/models/load.py
@@ -0,0 +1,72 @@
+import torch
+from mmengine.dataset import Compose
+from mmengine.runner.checkpoint import _load_checkpoint
+from torchvision.transforms import Normalize
+
+from .actionclip import ActionClip
+
+_MODELS = {
+ 'ViT-B/32-8':
+ 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p32-res224-clip-pre_1x1x8_k400-rgb/vit-b-32-8f.pth', # noqa: E501
+ 'ViT-B/16-8':
+ 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x8_k400-rgb/vit-b-16-8f.pth', # noqa: E501
+ 'ViT-B/16-16':
+ 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x16_k400-rgb/vit-b-16-16f.pth', # noqa: E501
+ 'ViT-B/16-32':
+ 'https://download.openmmlab.com/mmaction/v1.0/projects/actionclip/actionclip_vit-base-p16-res224-clip-pre_1x1x32_k400-rgb/vit-b-16-32f.pth', # noqa: E501
+}
+
+
+def available_models():
+ """Returns the names of available ActionCLIP models."""
+ return list(_MODELS.keys())
+
+
+def _transform(num_segs):
+ pipeline = [
+ dict(type='DecordInit'),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=num_segs,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ lambda x: torch.tensor(x['imgs']).div(255),
+ Normalize((0.48145466, 0.4578275, 0.40821073),
+ (0.26862954, 0.26130258, 0.27577711)),
+ ]
+ return Compose(pipeline)
+
+
+def init_actionclip(name, device):
+ assert name in _MODELS, \
+ f'Model {name} not found; available models = {available_models()}'
+ model_path = _MODELS[name]
+
+ checkpoint = _load_checkpoint(model_path, map_location='cpu')
+ state_dict = checkpoint['state_dict']
+
+ clip_arch = name.split('-')[0] + '-' + name.split('-')[1]
+
+ num_adapter_segs = int(name.split('-')[2])
+ assert num_adapter_segs == \
+ state_dict['adapter.frame_position_embeddings.weight'].shape[0]
+ num_adapter_layers = len([
+ k for k in state_dict.keys()
+ if k.startswith('adapter.') and k.endswith('.attn.in_proj_weight')
+ ])
+
+ model = ActionClip(
+ clip_arch=clip_arch,
+ num_adapter_segs=num_adapter_segs,
+ num_adapter_layers=num_adapter_layers)
+
+ model.load_state_dict(state_dict)
+ model.to(device)
+ model.eval()
+
+ return model, _transform(num_adapter_segs)
diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md
index 9dda2b2365..19a8a5e746 100644
--- a/projects/ctrgcn/README.md
+++ b/projects/ctrgcn/README.md
@@ -20,9 +20,11 @@ Graph convolutional networks (GCNs) have been widely used and achieved remarkabl
### Setup Environment
-Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
+Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
-At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it.
+Assume that you are located at `$MMACTION2/projects/ctrgcn`.
+
+Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it.
> Please run it every time after you opened a new shell.
@@ -34,6 +36,12 @@ export PYTHONPATH=`pwd`:$PYTHONPATH
Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/skeleton/README.md).
+Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link.
+
+```shell
+ln -s ../../data ./data
+```
+
### Training commands
**To train with single GPU:**
@@ -103,41 +111,3 @@ mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
year={2021}
}
```
-
-## Checklist
-
-Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects.
-
-- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
-
- - [x] Finish the code
-
-
-
- - [x] Basic docstrings & proper citation
-
-
-
- - [x] Converted checkpoint and results (Only for reproduction)
-
-
-
-- [x] Milestone 2: Indicates a successful model implementation.
-
- - [x] Training results
-
-
-
-- [ ] Milestone 3: Good to be a part of our core package!
-
- - [ ] Unit tests
-
-
-
- - [ ] Code style
-
-
-
- - [ ] `metafile.yml` and `README.md`
-
-
diff --git a/projects/example_project/README.md b/projects/example_project/README.md
index 4e12a80e2f..a648f30568 100644
--- a/projects/example_project/README.md
+++ b/projects/example_project/README.md
@@ -66,9 +66,9 @@ mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-r
## Results
-| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :-----------------------------------------------------------------------------: | ----------------: | --------------: |
-| 1x1x3 | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | [config](./configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](CKPT-LINK) | [log](LOG-LINK) |
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :-------------------------------------------: | -------------------------------------: | -----------------------------: |
+| 1x1x3 | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | [config](./configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://example/checkpoint/url) | [log](https://example/log/url) |
## Citation
diff --git a/projects/gesture_recognition/README.md b/projects/gesture_recognition/README.md
new file mode 100644
index 0000000000..47ca04e472
--- /dev/null
+++ b/projects/gesture_recognition/README.md
@@ -0,0 +1,33 @@
+# Gesture Recognition
+
+
+
+## Introduction
+
+
+
+In this project, we present a skeleton-based pipeline for gesture recognition. The pipeline consists of three stages: the first stage is a hand detection module that outputs bounding boxes of human hands from video frames; the second stage employs a pose estimation module to generate keypoints of the detected hands; and the third stage uses a skeleton-based gesture recognition module to classify hand actions based on the extracted hand skeletons. The pipeline is lightweight and can run in real time on CPU devices. In this README, we provide the models and the inference demo for the project; training data preparation and training scripts are described in [TRAINING.md](/projects/gesture_recognition/TRAINING.md).
+
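+The snippet below is a minimal sketch of how the three stages fit together, assuming the standard MMDetection and MMPose inference APIs; the config and checkpoint paths are placeholders, and the actual demo script of this project may differ in details.
+
+```python
+# Hypothetical end-to-end sketch: detect hands, estimate hand keypoints, and
+# collect per-frame skeletons for the gesture recognizer.
+from mmdet.apis import inference_detector, init_detector
+from mmpose.apis import inference_topdown, init_model
+
+det_model = init_detector(
+    'configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py',
+    'PATH_TO_HAND_DET_CHECKPOINT', device='cpu')
+pose_model = init_model(
+    'configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py',
+    'PATH_TO_POSE_CHECKPOINT', device='cpu')
+
+frame = 'demo/hand_det.jpg'
+# Stage 1: hand bounding boxes (score filtering omitted for brevity)
+det_result = inference_detector(det_model, frame)
+bboxes = det_result.pred_instances.bboxes.cpu().numpy()
+# Stage 2: hand keypoints for each detected box
+pose_results = inference_topdown(pose_model, frame, bboxes)
+keypoints = [p.pred_instances.keypoints for p in pose_results]
+# Stage 3: stack the per-frame keypoints over a clip and feed them to the
+# skeleton-based recognizer (STGCN++) described below.
+```
+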
+## Hand detection stage
+
+Hand detection results on the OneHand10K validation set
+
+| Config | Input Size | bbox mAP | bbox mAP 50 | bbox mAP 75 | ckpt | log |
+| :------------------------------------------------------ | :--------: | :------: | :---------: | :---------: | :---------------------------------------------------: | :--------------------------------------------------: |
+| [rtmdet_nano](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) | 320x320 | 0.8100 | 0.9870 | 0.9190 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.log) |
+
+## Pose estimation stage
+
+Pose estimation results on COCO-WholeBody-Hand validation set
+
+| Config | Input Size | PCK@0.2 | AUC | EPE | ckpt |
+| :----------------------------------------------------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :-------------------------------------: |
+| [rtmpose_m](/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.815 | 0.837 | 4.51 | [ckpt](https://download.openmmlab.com/) |
+
+## Gesture recognition stage
+
+Skeleton-based gesture recognition results on the Jester validation set
+
+| Config | Input Size | Top 1 accuracy | Top 5 accuracy | ckpt | log |
+| :------------------------------------------------------ | :--------: | :------------: | :------------: | :----------------------------------------------------: | :---------------------------------------------------: |
+| [STGCNPP](/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py) | 100x17x3 | 89.22 | 97.52 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d_20230524-fffa7ff0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.log) |
diff --git a/projects/gesture_recognition/TRAINING.md b/projects/gesture_recognition/TRAINING.md
new file mode 100644
index 0000000000..9de0a781b1
--- /dev/null
+++ b/projects/gesture_recognition/TRAINING.md
@@ -0,0 +1,89 @@
+In this document, we show how to prepare the training data and train the models required for this project.
+
+# Hand detection
+
+## Data Preparation
+
+We use multiple hand pose estimation datasets to generate a hand detection dataset: the circumscribed rectangle of the hand keypoints is used as the detection bounding box of the hand. In our demo, we use four datasets supported by [MMPose](https://github.com/open-mmlab/mmpose): [FreiHAND Dataset](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#freihand-dataset), [OneHand10K](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#onehand10k), [RHD Dataset](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_hand_keypoint.html#rhd-dataset) and [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe). You can find instructions for preparing each dataset at the corresponding link.
+
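+As a rough illustration of this conversion (the actual logic lives in `parse_pose.py`; the padding ratio below is an assumption), a hand box can be derived from the keypoints like this:
+
+```python
+import numpy as np
+
+
+def keypoints_to_bbox(keypoints: np.ndarray, padding: float = 1.25) -> list:
+    """Turn a (num_points, 2) array of hand keypoints into a COCO-style
+    [x, y, w, h] bounding box."""
+    x_min, y_min = keypoints.min(axis=0)
+    x_max, y_max = keypoints.max(axis=0)
+    # enlarge the tight circumscribed rectangle a little so the whole hand fits
+    cx, cy = (x_min + x_max) / 2, (y_min + y_max) / 2
+    w, h = (x_max - x_min) * padding, (y_max - y_min) * padding
+    return [cx - w / 2, cy - h / 2, w, h]
+```
+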
+To train the hand detection model, you need to install [MMDet](https://github.com/open-mmlab/mmdetection) and move (or link) the above datasets to `$MMDet/data/`. The folder structure should look like this:
+
+```
+mmdetection
+├── mmdetection
+├── docs
+├── tests
+├── tools
+├── configs
+└── data
+    ├── freihand
+    │   ├── annotations
+    │   └── ..
+    ├── onehand10k
+    │   ├── annotations
+    │   └── ..
+    ├── rhd
+    │   ├── annotations
+    │   └── ..
+    └── halpe
+        ├── annotations
+        ├── hico_20160224_det
+        │   ├── images
+        │   └── ..
+        └── ..
+```
+
+We provide a [parse_pose.py](/projects/gesture_recognition/parse_pose.py) script to convert the annotation files of the above pose datasets to a COCO-style detection annotation. Assuming you are at `$MMDet/data`, run the following command and it will generate `hand_det_train.json` and `hand_det_val.json` at `$MMDet/data/hand_det/`.
+
+```
+python3 $MMAction/projects/gesture_recognition/parse_pose.py
+```
+
+The training annotation file combines the above four datasets, while the validation annotation file only uses the OneHand10K validation set for quick verification. You can also add more hand detection datasets to improve performance. Now we are done with data preparation.
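+
+Before moving on, it can be worth a quick sanity check that the generated files are valid COCO-style annotations. A small sketch, assuming you run it from `$MMDet/data`:
+
+```python
+import json
+
+# load the generated training annotation and print a few statistics
+with open('hand_det/hand_det_train.json') as f:
+    anno = json.load(f)
+
+print(len(anno['images']), 'images')
+print(len(anno['annotations']), 'hand boxes')
+print(anno['categories'])  # [{'id': 0, 'name': 'hand'}]
+```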
+
+## Training and inference
+
+We provide a [config](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) to train an [RTMDet](https://arxiv.org/abs/2212.07784) detection model. Suppose you are at `$MMDet`; you can run the following command to train the hand detection model with 8 GPUs:
+
+```bash
+bash tools/dist_train.sh $MMAction/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py 8
+```
+
+To see the detection result for a single image, we can use `$MMDet/demo/image_demo.py`. The following command will run inference on a single [image](/projects/gesture_recognition/demo/hand_det.jpg) (from a video in the [jester dataset](/tools/data/jester)), and the output should be similar to [this image](/projects/gesture_recognition/demo/hand_det_out.jpg).
+
+```bash
+python3 $MMDet/demo/image_demo.py $MMAction/projects/gesture_recognition/demo/hand_det.jpg PATH_TO_HAND_DET_CHECKPOINT --out-dir='.'
+```
+
+# Pose estimation
+
+We directly use the pose estimation model from MMPose. Please refer to [RTMPose](https://github.com/open-mmlab/mmpose/tree/main/configs/hand_2d_keypoint/rtmpose) for details.
+
+# Gesture recognition
+
+## Data Preparation
+
+We use the [jester dataset](/tools/data/jester) to train a skeleton-based gesture recognition model. Please follow the link to prepare this dataset (in frames).
+
+Once we have the jester dataset, we provide [extract_keypoint.py](/projects/gesture_recognition/extract_keypoint.py) to extract the hand keypoints for all video frames in the dataset. This step requires the hand detection model and the pose estimation model from the above two stages. Here is an example of extracting the keypoints for the dataset. You may need to modify the paths to the dataset, configs, or checkpoints according to your system.
+
+```bash
+ROOT_TO_JESTER='20bn-jester-v1'
+POSE_CONFIG='rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py'
+POSE_CKPT='hand-cocktail5-4e-4-bs256-210e-b74fb594_20230320.pth'
+DET_CONFIG='rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py'
+DET_CKPT='rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth'
+python3 -u extract_keypoint.py $ROOT_TO_JESTER \
+ --pose_config $POSE_CONFIG --pose_ckpt $POSE_CKPT \
+    --det_config $DET_CONFIG --det_ckpt $DET_CKPT
+```
+
+The program will generate a `jester.pkl` file in your current directory. Then move this file to `$MMAction`. We will use this file for skeleton-based gesture recognition training.
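+
+If you want to inspect the extracted keypoints, `jester.pkl` is a plain dictionary keyed by frame path, following the format written by `extract_keypoint.py`:
+
+```python
+import pickle
+
+with open('jester.pkl', 'rb') as f:
+    lookup = pickle.load(f)
+
+# each entry maps a frame path to (keypoints, keypoint_scores, (width, height))
+frame_path, (keypoints, scores, (width, height)) = next(iter(lookup.items()))
+print(frame_path, keypoints.shape, scores.shape, (width, height))
+```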
+
+## Training and inference
+
+We provide a [config](/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py) to train an STGCN++ model. Suppose you are at `$MMAction`; you can run the following command to train the model with 8 GPUs:
+
+```bash
+bash tools/dist_train.sh $MMAction/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py 8
+```
diff --git a/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py b/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py
new file mode 100644
index 0000000000..b581150f99
--- /dev/null
+++ b/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py
@@ -0,0 +1,123 @@
+_base_ = 'mmdet::rtmdet/rtmdet_nano_8xb32-300e_coco.py'
+
+input_shape = 320
+
+model = dict(
+ backbone=dict(
+ deepen_factor=0.33,
+ widen_factor=0.25,
+ use_depthwise=True,
+ ),
+ neck=dict(
+ in_channels=[64, 128, 256],
+ out_channels=64,
+ num_csp_blocks=1,
+ use_depthwise=True,
+ ),
+ bbox_head=dict(
+ in_channels=64,
+ feat_channels=64,
+ share_conv=False,
+ exp_on_reg=False,
+ use_depthwise=True,
+ num_classes=1),
+ test_cfg=dict(
+ nms_pre=1000,
+ min_bbox_size=0,
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.6),
+ max_per_img=100))
+
+data_root = 'data/'
+file_client_args = dict(backend='disk')
+
+train_pipeline = [
+ dict(type='LoadImageFromFile', file_client_args=file_client_args),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='CachedMosaic',
+ img_scale=(input_shape, input_shape),
+ pad_val=114.0,
+ max_cached_images=20,
+ random_pop=False),
+ dict(
+ type='RandomResize',
+ scale=(input_shape * 2, input_shape * 2),
+ ratio_range=(0.5, 1.5),
+ keep_ratio=True),
+ dict(type='RandomCrop', crop_size=(input_shape, input_shape)),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImageFromFile', file_client_args=file_client_args),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='RandomResize',
+ scale=(input_shape, input_shape),
+ ratio_range=(0.5, 1.5),
+ keep_ratio=True),
+ dict(type='RandomCrop', crop_size=(input_shape, input_shape)),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+ dict(type='LoadImageFromFile', file_client_args=file_client_args),
+ dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+
+train_dataloader = dict(
+ dataset=dict(
+ data_root=data_root,
+ ann_file='hand_det/hand_det_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ metainfo=dict(classes=('hand', )),
+ ))
+
+val_dataloader = dict(
+ dataset=dict(
+ data_root=data_root,
+ ann_file='hand_det/hand_det_val.json',
+ data_prefix=dict(img=''),
+ pipeline=test_pipeline,
+ metainfo=dict(classes=('hand', )),
+ ))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root + 'hand_det/hand_det_val.json')
+test_evaluator = val_evaluator
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='PipelineSwitchHook',
+ switch_epoch=280,
+ switch_pipeline=train_pipeline_stage2)
+]
diff --git a/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py b/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000000..927dac3552
--- /dev/null
+++ b/projects/gesture_recognition/configs/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,339 @@
+default_scope = 'mmpose'
+default_hooks = dict(
+ timer=dict(type='IterTimerHook'),
+ logger=dict(type='LoggerHook', interval=50),
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ checkpoint=dict(
+ type='CheckpointHook',
+ interval=10,
+ save_best='AUC',
+ rule='greater',
+ max_keep_ckpts=1),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ visualization=dict(type='PoseVisualizationHook', enable=False))
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=180,
+ switch_pipeline=[
+ dict(type='LoadImage', file_client_args=dict(backend='disk')),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.0,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=(256, 256)),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5)
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)),
+ dict(type='PackPoseInputs')
+ ])
+]
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'))
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='PoseLocalVisualizer',
+ vis_backends=[dict(type='LocalVisBackend')],
+ name='visualizer')
+log_processor = dict(
+ type='LogProcessor', window_size=50, by_epoch=True, num_digits=6)
+log_level = 'INFO'
+load_from = None
+resume = False
+file_client_args = dict(backend='disk')
+train_cfg = dict(by_epoch=True, max_epochs=210, val_interval=10)
+val_cfg = dict()
+test_cfg = dict()
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 0.004
+randomness = dict(seed=21)
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+param_scheduler = [
+ dict(
+ type='LinearLR', start_factor=1e-05, by_epoch=False, begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=105,
+ end=210,
+ T_max=105,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+auto_scale_lr = dict(base_batch_size=256)
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint=('https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-'
+ 'f2f7d6f6_20230130.pth'))),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=21,
+ input_size=(256, 256),
+ in_featuremap_size=(8, 8),
+ simcc_split_ratio=2.0,
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.0,
+ label_softmax=True),
+ decoder=dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)),
+ test_cfg=dict(flip_test=True))
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+train_pipeline = [
+ dict(type='LoadImage', file_client_args=dict(backend='disk')),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=(256, 256)),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0)
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', file_client_args=dict(backend='disk')),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=(256, 256)),
+ dict(type='PackPoseInputs')
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage', file_client_args=dict(backend='disk')),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.0,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=(256, 256)),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5)
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)),
+ dict(type='PackPoseInputs')
+]
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CocoWholeBodyHandDataset',
+ data_root='data/coco/',
+ data_mode='topdown',
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[
+ dict(type='LoadImage', file_client_args=dict(backend='disk')),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ scale_factor=[0.5, 1.5],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=(256, 256)),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0)
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)),
+ dict(type='PackPoseInputs')
+ ]))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyHandDataset',
+ data_root='data/coco/',
+ data_mode='topdown',
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=[
+ dict(type='LoadImage', file_client_args=dict(backend='disk')),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=(256, 256)),
+ dict(type='PackPoseInputs')
+ ]))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyHandDataset',
+ data_root='data/coco/',
+ data_mode='topdown',
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=[
+ dict(type='LoadImage', file_client_args=dict(backend='disk')),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=(256, 256)),
+ dict(type='PackPoseInputs')
+ ]))
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
diff --git a/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py b/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py
new file mode 100644
index 0000000000..07f5ec57fe
--- /dev/null
+++ b/projects/gesture_recognition/configs/stgcnpp_8xb16-joint-u100-16e_jester-keypoint-2d.py
@@ -0,0 +1,113 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+hand_layout = dict(
+ num_node=17,
+ inward=[(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), (12, 6), (9, 7),
+ (7, 5), (10, 8), (8, 6), (5, 0), (6, 0), (1, 0), (3, 1), (2, 0),
+ (4, 2)],
+ center=0)
+
+model = dict(
+ type='RecognizerGCN',
+ backbone=dict(
+ type='STGCN',
+ gcn_adaptive='init',
+ gcn_with_res=True,
+ tcn_type='mstcn',
+ graph_cfg=dict(layout=hand_layout, mode='spatial')),
+ cls_head=dict(type='GCNHead', num_classes=27, in_channels=256))
+
+dataset_type = 'PoseDataset'
+ann_file = 'jester.pkl'
+train_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(type='UniformSampleFrames', clip_len=100),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=1),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=1),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=10,
+ test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=1),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=4,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=train_pipeline,
+ split='train')))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=val_pipeline,
+ split='val',
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=test_pipeline,
+ split='val',
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0,
+ T_max=16,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True))
+
+default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/projects/gesture_recognition/demo/hand_det.jpg b/projects/gesture_recognition/demo/hand_det.jpg
new file mode 100644
index 0000000000..c12616fc49
Binary files /dev/null and b/projects/gesture_recognition/demo/hand_det.jpg differ
diff --git a/projects/gesture_recognition/demo/hand_det_out.jpg b/projects/gesture_recognition/demo/hand_det_out.jpg
new file mode 100644
index 0000000000..2f3c1bed41
Binary files /dev/null and b/projects/gesture_recognition/demo/hand_det_out.jpg differ
diff --git a/projects/gesture_recognition/extract_keypoint.py b/projects/gesture_recognition/extract_keypoint.py
new file mode 100644
index 0000000000..53cf311575
--- /dev/null
+++ b/projects/gesture_recognition/extract_keypoint.py
@@ -0,0 +1,115 @@
+import copy
+import os
+import pickle
+import time
+from argparse import ArgumentParser
+
+import cv2
+import numpy as np
+import torch
+from mmdet.apis import init_detector
+from mmengine.dataset import Compose, pseudo_collate
+from mmengine.registry import init_default_scope
+from mmpose.apis import init_model
+from PIL import Image
+
+
+def parse_args():
+ parser = ArgumentParser()
+ parser.add_argument('root', help='Video folder root')
+ parser.add_argument('--pose_config', help='Pose config file')
+ parser.add_argument('--pose_ckpt', help='Pose checkpoint file')
+ parser.add_argument('--det_config', help='Hand detection config file')
+ parser.add_argument('--det_ckpt', help='Hand detection checkpoint file')
+ parser.add_argument(
+ '--device', default='cuda:0', help='Device used for inference')
+ args = parser.parse_args()
+ return args
+
+
+@torch.no_grad()
+def inference_topdown(model, pose_pipeline, det_model, det_pipeline, folder):
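+    """Detect the hand in every frame under ``folder`` and run top-down pose
+    estimation on the detected (or whole-image) boxes.
+
+    Return a dict mapping each frame path to a tuple of
+    ``(keypoints, keypoint_scores, (width, height))``.
+    """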
+
+ img_paths = [f'{folder}/{img}' for img in os.listdir(folder)]
+
+ w, h = Image.open(img_paths[0]).size
+ bbox0 = np.array([[0, 0, w, h]], dtype=np.float32)
+
+ imgs = [cv2.imread(img_path) for img_path in img_paths]
+
+ data_list = [
+ dict(img=copy.deepcopy(img), img_id=idx)
+ for idx, img in enumerate(imgs)
+ ]
+ data_list = [det_pipeline(data_info) for data_info in data_list]
+ batch = pseudo_collate(data_list)
+ bbox_results = det_model.test_step(batch)
+ bboxes = [i.pred_instances.bboxes[:1].cpu().numpy() for i in bbox_results]
+ scores = []
+ for i in bbox_results:
+ try:
+ score = i.pred_instances.scores[0].item()
+ except Exception as ex:
+ print(ex)
+ score = 0
+ scores.append(score)
+ data_list = []
+ for img, bbox, score in zip(imgs, bboxes, scores):
+ data_info = dict(img=img)
+ if bbox.shape == bbox0.shape and score > 0.3:
+            if score > 0.5:
+                data_info['bbox'] = bbox
+            else:
+                # blend the low-confidence detection box with the full-image
+                # box; use a new name so the image width `w` is not shadowed
+                blend = (score - 0.1) / 0.4
+                data_info['bbox'] = blend * bbox + (1 - blend) * bbox0
+ else:
+ data_info['bbox'] = bbox0
+ data_info['bbox_score'] = np.ones(1, dtype=np.float32) # shape (1,)
+ data_info.update(model.dataset_meta)
+ data_list.append(pose_pipeline(data_info))
+
+ batch = pseudo_collate(data_list)
+ results = model.test_step(batch)
+
+ lookup = {}
+ for img_path, result in zip(img_paths, results):
+ keypoints = result.pred_instances.keypoints
+ scores = result.pred_instances.keypoint_scores
+ lookup[img_path] = (keypoints, scores, (w, h))
+ return lookup
+
+
+def main():
+ args = parse_args()
+
+ det_model = init_detector(
+ args.det_config, args.det_ckpt, device=args.device)
+ det_model.cfg.test_dataloader.dataset.pipeline[
+ 0].type = 'mmdet.LoadImageFromNDArray'
+ det_pipeline = Compose(det_model.cfg.test_dataloader.dataset.pipeline)
+
+ model = init_model(
+        args.pose_config, args.pose_ckpt, device=args.device)
+ init_default_scope(model.cfg.get('default_scope', 'mmpose'))
+
+ folders = [f'{args.root}/{folder}' for folder in os.listdir(args.root)]
+
+ pose_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline)
+ # inference a single image
+ lookup = {}
+ L = len(folders)
+ t = time.time()
+ for idx, folder in enumerate(folders):
+ results = inference_topdown(model, pose_pipeline, det_model,
+ det_pipeline, folder)
+ lookup.update(results)
+        if idx % 100 == 99:
+            eta = (time.time() - t) / (idx + 1) * (L - idx - 1) / 3600
+            print('Estimated time remaining: %.2f hours' % eta)
+
+ with open('jester.pkl', 'wb') as f:
+ pickle.dump(lookup, f)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/projects/gesture_recognition/parse_pose.py b/projects/gesture_recognition/parse_pose.py
new file mode 100644
index 0000000000..b8bbbe6490
--- /dev/null
+++ b/projects/gesture_recognition/parse_pose.py
@@ -0,0 +1,179 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import os
+
+import numpy as np
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Convert and merge hand pose dataset to COCO style')
+ parser.add_argument(
+ '--data_root',
+ type=str,
+ default='./data/',
+ help='the root to all involved datasets')
+ parser.add_argument(
+ '--out_anno_prefix',
+ type=str,
+ default='hand_det',
+ help='the prefix of output annotation files')
+
+ args = parser.parse_args()
+ return args
+
+
+def get_data_root(path):
+ path = path.split('/')
+ index = path.index('annotations') - 1
+ root = path[index]
+ if root == 'halpe':
+ root = 'halpe/hico_20160224_det/images/train2015/'
+ return root
+
+
+def parse_coco_style(file_path, anno_idx=0):
+ with open(file_path) as f:
+ contents = json.load(f)
+
+ data_root = get_data_root(file_path) + '/'
+ images = contents['images']
+ annos = contents['annotations']
+ images_out, annos_out = [], []
+ for img, anno in zip(images, annos):
+ assert img['id'] == anno['image_id']
+ img_out = dict(
+ file_name=data_root + img['file_name'],
+ height=img['height'],
+ width=img['width'],
+ id=anno_idx)
+ anno_out = dict(
+ area=anno['area'],
+ iscrowd=anno['iscrowd'],
+ image_id=anno_idx,
+ bbox=anno['bbox'],
+ category_id=0,
+ id=anno_idx)
+ anno_idx += 1
+ images_out.append(img_out)
+ annos_out.append(anno_out)
+ return images_out, annos_out, anno_idx
+
+
+def parse_halpe(file_path, anno_idx):
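+    """Convert Halpe whole-body annotations into hand detection annotations.
+
+    For each instance, the last 42 keypoints are the hand keypoints (21 for
+    the left hand followed by 21 for the right hand); a separate bounding
+    box is generated for every hand with visible keypoints.
+    """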
+
+ def get_bbox(keypoints):
+ """Get bbox from keypoints."""
+ if len(keypoints) == 0:
+ return [0, 0, 0, 0]
+ x1, y1, _ = np.amin(keypoints, axis=0)
+ x2, y2, _ = np.amax(keypoints, axis=0)
+ w, h = x2 - x1, y2 - y1
+ return [x1, y1, w, h]
+
+ with open(file_path) as f:
+ contents = json.load(f)
+
+ data_root = get_data_root(file_path) + '/'
+ images = contents['images']
+ annos = contents['annotations']
+ images_out, annos_out = [], []
+ for img, anno in zip(images, annos):
+ assert img['id'] == anno['image_id']
+ keypoints = np.array(anno['keypoints']).reshape(-1, 3)
+ lefthand_kpts = keypoints[-42:-21, :]
+ righthand_kpts = keypoints[-21:, :]
+
+ left_mask = lefthand_kpts[:, 2] > 0
+ right_mask = righthand_kpts[:, 2] > 0
+ lefthand_box = get_bbox(lefthand_kpts[left_mask])
+ righthand_box = get_bbox(righthand_kpts[right_mask])
+
+ if max(lefthand_box) > 0:
+ img_out = dict(
+ file_name=data_root + img['file_name'],
+ height=img['height'],
+ width=img['width'],
+ id=anno_idx)
+ anno_out = dict(
+ area=lefthand_box[2] * lefthand_box[3],
+ iscrowd=anno['iscrowd'],
+ image_id=anno_idx,
+ bbox=lefthand_box,
+ category_id=0,
+ id=anno_idx)
+ anno_idx += 1
+ images_out.append(img_out)
+ annos_out.append(anno_out)
+
+ if max(righthand_box) > 0:
+ img_out = dict(
+ file_name=data_root + img['file_name'],
+ height=img['height'],
+ width=img['width'],
+ id=anno_idx)
+ anno_out = dict(
+ area=righthand_box[2] * righthand_box[3],
+ iscrowd=anno['iscrowd'],
+ image_id=anno_idx,
+ bbox=righthand_box,
+ category_id=0,
+ id=anno_idx)
+ anno_idx += 1
+ images_out.append(img_out)
+ annos_out.append(anno_out)
+ return images_out, annos_out, anno_idx
+
+
+train_files = [
+ 'freihand/annotations/freihand_train.json',
+ 'halpe/annotations/halpe_train_v1.json',
+ 'onehand10k/annotations/onehand10k_train.json',
+    'rhd/annotations/rhd_train.json'
+]
+
+val_files = ['onehand10k/annotations/onehand10k_test.json']
+
+
+def convert2dict(data_root, anno_files):
+ anno_files = [data_root + _ for _ in anno_files]
+
+ images, annos, anno_idx = [], [], 0
+ for anno_file in anno_files:
+ if 'freihand' in anno_file or 'onehand10k' in anno_file \
+ or 'rhd' in anno_file:
+ images_out, annos_out, anno_idx = parse_coco_style(
+ anno_file, anno_idx)
+ images += images_out
+ annos += annos_out
+ elif 'halpe' in anno_file:
+ images_out, annos_out, anno_idx = parse_halpe(anno_file, anno_idx)
+ images += images_out
+ annos += annos_out
+ else:
+ print(f'{anno_file} not supported')
+
+ result = dict(
+ images=images,
+ annotations=annos,
+ categories=[{
+ 'id': 0,
+ 'name': 'hand'
+ }])
+ return result
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ data_root = args.data_root + '/'
+ prefix = args.out_anno_prefix
+ os.makedirs('hand_det', exist_ok=True)
+
+ result = convert2dict(data_root, train_files)
+ with open(f'hand_det/{prefix}_train.json', 'w') as f:
+ json.dump(result, f)
+
+ result = convert2dict(data_root, val_files)
+ with open(f'hand_det/{prefix}_val.json', 'w') as f:
+ json.dump(result, f)
diff --git a/projects/knowledge_distillation/README.md b/projects/knowledge_distillation/README.md
new file mode 100644
index 0000000000..8ce23b09b6
--- /dev/null
+++ b/projects/knowledge_distillation/README.md
@@ -0,0 +1,132 @@
+# Knowledge Distillation Based on MMRazor
+
+Knowledge distillation is a classic model compression method. The core idea is to let a lightweight student model "imitate" a teacher model (or a multi-model ensemble) with better performance and a more complex structure, improving the performance of the student model without changing its structure. [MMRazor](https://github.com/open-mmlab/mmrazor) is a model compression toolkit for model slimming and AutoML that supports several KD algorithms. In this project, we take TSM-MobileNetV2 as an example to show how to use MMRazor to perform knowledge distillation on action recognition models. You can refer to [MMRazor](https://github.com/open-mmlab/mmrazor) for more model compression algorithms.
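+
+As a rough illustration of logits-based distillation, the student is trained to match the teacher's softened class distribution. The sketch below uses a plain temperature-scaled KL divergence; the configs in this project use MMRazor's `DISTLoss` instead, so treat this only as an illustration of the general idea.
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def kd_logits_loss(student_logits, teacher_logits, tau=1.0):
+    """Temperature-scaled KL divergence between student and teacher logits."""
+    log_p_s = F.log_softmax(student_logits / tau, dim=1)
+    p_t = F.softmax(teacher_logits / tau, dim=1)
+    return F.kl_div(log_p_s, p_t, reduction='batchmean') * tau * tau
+
+
+# toy batch with 400 classes (e.g. Kinetics-400)
+student_logits = torch.randn(4, 400)
+teacher_logits = torch.randn(4, 400)
+print(kd_logits_loss(student_logits, teacher_logits, tau=1.0))
+```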
+
+## Description
+
+This is an implementation of the MMRazor knowledge distillation application. We provide action recognition configs and models for MMRazor.
+
+## Usage
+
+### Prerequisites
+
+- [MMRazor v1.0.0](https://github.com/open-mmlab/mmrazor/tree/v1.0.0) or higher
+
+There are two install modes:
+
+Option (a). Install as a Python package
+
+```shell
+mim install "mmrazor>=1.0.0"
+```
+
+Option (b). Install from source
+
+```shell
+git clone https://github.com/open-mmlab/mmrazor.git
+cd mmrazor
+pip install -v -e .
+```
+
+### Setup Environment
+
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
+
+First, add the current folder to `PYTHONPATH` so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run it every time you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/kinetics/README.md).
+
+Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link.
+
+```shell
+ln -s ../../data ./data
+```
+
+### Training commands
+
+**To train with single GPU:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py
+```
+
+**To train with multiple GPUs:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py --launcher pytorch --gpus 8
+```
+
+**To train with multiple GPUs by slurm:**
+
+```bash
+mim train mmrazor configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+### Testing commands
+
+Please convert the knowledge distillation checkpoint to a student-only checkpoint with the following command. You will get a checkpoint with a '\_student.pth' suffix in the same directory as the original checkpoint. Then use the student-only checkpoint for testing.
+
+```bash
+mim run mmrazor convert_kd_ckpt_to_student $CHECKPOINT
+```
+
+**To test with single GPU:**
+
+```bash
+mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT
+```
+
+**To test with multiple GPUs:**
+
+```bash
+mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8
+```
+
+**To test with multiple GPUs by slurm:**
+
+```bash
+mim test mmaction tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+## Results and models
+
+| Location | Dataset | Teacher | Student | Acc (distilled) | Acc (T) | Acc (S) | Config | Download |
+| :------: | :----------: | :------------: | :---------------: | :---------: | :----: | :----: | :-------------------: | :---------------------------------------------------------------------------- |
+| logits | Kinetics-400 | [TSM-ResNet50] | [TSM-MobileNetV2] | 69.60(+0.9) | 73.22 | 68.71 | [config][distill_tsm] | [teacher][tsm_r50_pth] \| [model][distill_pth_tsm] \| [log][distill_log_tsm] |
+| logits | Kinetics-400 | [TSN-Swin] | [TSN-ResNet50] | 75.54(+1.4) | 79.22 | 74.12 | [config][distill_tsn] | [teacher][tsn_swin_pth] \| [model][distill_pth_tsn] \| [log][distill_log_tsn] |
+
+## Citation
+
+```latex
+@article{huang2022knowledge,
+ title={Knowledge Distillation from A Stronger Teacher},
+ author={Huang, Tao and You, Shan and Wang, Fei and Qian, Chen and Xu, Chang},
+ journal={arXiv preprint arXiv:2205.10536},
+ year={2022}
+}
+```
+
+[distill_log_tsm]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.log
+[distill_log_tsn]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsn-swin_tsn-r50_1x1x8_k400/kd_logits_tsn-swin_tsn-r50_1x1x8_k400.log
+[distill_pth_tsm]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400_20230517-c3e8aa0d.pth
+[distill_pth_tsn]: https://download.openmmlab.com/mmaction/v1.0/projects/knowledge_distillation/kd_logits_tsn-swin_tsn-r50_1x1x8_k400/kd_logits_tsn-swin_tsn-r50_1x1x8_k400_student_20230530-f938d404.pth
+[distill_tsm]: configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py
+[distill_tsn]: configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py
+[tsm-mobilenetv2]: ../../configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py
+[tsm-resnet50]: ../../configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py
+[tsm_r50_pth]: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-100e_kinetics400-rgb_20220831-a6db1e5d.pth
+[tsn-resnet50]: ../../configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py
+[tsn-swin]: ../../configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py
+[tsn_swin_pth]: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth
diff --git a/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py b/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py
new file mode 100644
index 0000000000..dc1f536860
--- /dev/null
+++ b/projects/knowledge_distillation/configs/kd_logits_tsm-res50_tsm-mobilenetv2_8xb16_k400.py
@@ -0,0 +1,36 @@
+_base_ = 'mmaction::recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py' # noqa: E501
+
+teacher_ckpt = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth' # noqa: E501
+model = dict(
+ _delete_=True,
+ _scope_='mmrazor',
+ type='SingleTeacherDistill',
+ architecture=dict(
+ cfg_path= # noqa: E251
+ 'mmaction::recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py', # noqa: E501
+ pretrained=False),
+ teacher=dict(
+ cfg_path= # noqa: E251
+ 'mmaction::recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py', # noqa: E501
+ pretrained=False),
+ teacher_ckpt=teacher_ckpt,
+ distiller=dict(
+ type='ConfigurableDistiller',
+ student_recorders=dict(
+ logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')),
+ teacher_recorders=dict(
+ logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')),
+ distill_losses=dict(
+ loss_dist=dict(
+ type='DISTLoss',
+ inter_loss_weight=1.0,
+ intra_loss_weight=1.0,
+ tau=1,
+ loss_weight=1,
+ )),
+ loss_forward_mappings=dict(
+ loss_dist=dict(
+ logits_S=dict(from_student=True, recorder='logits'),
+ logits_T=dict(from_student=False, recorder='logits')))))
+
+val_cfg = dict(_delete_=True, type='mmrazor.SingleTeacherDistillValLoop')
diff --git a/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py b/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py
new file mode 100644
index 0000000000..3a5ea1dada
--- /dev/null
+++ b/projects/knowledge_distillation/configs/kd_logits_tsn-swin_tsn-r50_8xb16_k400.py
@@ -0,0 +1,38 @@
+_base_ = 'mmaction::recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py' # noqa: E501
+
+teacher_ckpt = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb_20230530-428f0064.pth' # noqa: E501
+
+model = dict(
+ _delete_=True,
+ _scope_='mmrazor',
+ type='SingleTeacherDistill',
+ architecture=dict(
+ cfg_path= # noqa: E251
+ 'mmaction::recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py', # noqa: E501
+ backbone=dict(pretrained=False),
+ pretrained=False),
+ teacher=dict(
+ cfg_path= # noqa: E251
+ 'mmaction::recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_32xb8-1x1x8-50e_kinetics400-rgb.py', # noqa: E501
+ pretrained=False),
+ teacher_ckpt=teacher_ckpt,
+ distiller=dict(
+ type='ConfigurableDistiller',
+ student_recorders=dict(
+ logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')),
+ teacher_recorders=dict(
+ logits=dict(type='ModuleOutputs', source='cls_head.fc_cls')),
+ distill_losses=dict(
+ loss_dist=dict(
+ type='DISTLoss',
+ inter_loss_weight=1.0,
+ intra_loss_weight=1.0,
+ tau=1,
+ loss_weight=4,
+ )),
+ loss_forward_mappings=dict(
+ loss_dist=dict(
+ logits_S=dict(from_student=True, recorder='logits'),
+ logits_T=dict(from_student=False, recorder='logits')))))
+
+val_cfg = dict(_delete_=True, type='mmrazor.SingleTeacherDistillValLoop')
diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md
index 637eb97520..cbd72b7ace 100644
--- a/projects/msg3d/README.md
+++ b/projects/msg3d/README.md
@@ -20,9 +20,11 @@ Spatial-temporal graphs have been widely used by skeleton-based action recogniti
### Setup Environment
-Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
+Please refer to [Installation](https://mmaction2.readthedocs.io/en/latest/get_started/installation.html) to install MMAction2.
-At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it.
+Assume that you are located at `$MMACTION2/projects/msg3d`.
+
+Add the current folder to `PYTHONPATH`, so that Python can find your code. Run the following command in the current directory to add it.
> Please run it every time after you opened a new shell.
@@ -34,6 +36,16 @@ export PYTHONPATH=`pwd`:$PYTHONPATH
Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/skeleton/README.md).
+Create a symbolic link from `$MMACTION2/data` to `./data` in the current directory, so that Python can locate your data. Run the following command in the current directory to create the symbolic link.
+
+```shell
+ln -s ../../data ./data
+```
+
### Training commands
**To train with single GPU:**
@@ -103,41 +115,3 @@ mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py -
year={2020}
}
```
-
-## Checklist
-
-Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects.
-
-- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
-
- - [x] Finish the code
-
-
-
- - [x] Basic docstrings & proper citation
-
-
-
- - [x] Converted checkpoint and results (Only for reproduction)
-
-
-
-- [x] Milestone 2: Indicates a successful model implementation.
-
- - [x] Training results
-
-
-
-- [ ] Milestone 3: Good to be a part of our core package!
-
- - [ ] Unit tests
-
-
-
- - [ ] Code style
-
-
-
- - [ ] `metafile.yml` and `README.md`
-
-
diff --git a/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py
new file mode 100644
index 0000000000..d8bc034187
--- /dev/null
+++ b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py
@@ -0,0 +1,254 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+model = dict(
+ type='FasterRCNN',
+ data_preprocessor=dict(
+ type='DetDataPreprocessor',
+ mean=[103.53, 116.28, 123.675],
+ std=[1.0, 1.0, 1.0],
+ bgr_to_rgb=False,
+ pad_size_divisor=32),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='caffe',
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+backend_args = None
+train_pipeline = [
+ dict(type='LoadImageFromFile', backend_args=None),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='RandomChoiceResize',
+ scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+ (1333, 768), (1333, 800)],
+ keep_ratio=True),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PackDetInputs')
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile', backend_args=None),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+train_dataloader = dict(
+ batch_size=2,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ batch_sampler=dict(type='AspectRatioBatchSampler'),
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/instances_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=[
+ dict(type='LoadImageFromFile', backend_args=None),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='RandomChoiceResize',
+ scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+ (1333, 768), (1333, 800)],
+ keep_ratio=True),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PackDetInputs')
+ ],
+ backend_args=None))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/instances_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=[
+ dict(type='LoadImageFromFile', backend_args=None),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+ ],
+ backend_args=None))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/instances_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=[
+ dict(type='LoadImageFromFile', backend_args=None),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+ ],
+ backend_args=None))
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file='data/coco/annotations/instances_val2017.json',
+ metric='bbox',
+ format_only=False,
+ backend_args=None)
+test_evaluator = dict(
+ type='CocoMetric',
+ ann_file='data/coco/annotations/instances_val2017.json',
+ metric='bbox',
+ format_only=False,
+ backend_args=None)
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+param_scheduler = [
+ dict(
+ type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=12,
+ by_epoch=True,
+ milestones=[8, 11],
+ gamma=0.1)
+]
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+default_scope = 'mmdet'
+default_hooks = dict(
+ timer=dict(type='IterTimerHook'),
+ logger=dict(type='LoggerHook', interval=50),
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ checkpoint=dict(type='CheckpointHook', interval=1),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ visualization=dict(type='DetVisualizationHook'))
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'))
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='DetLocalVisualizer',
+ vis_backends=[dict(type='LocalVisBackend')],
+ name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+log_level = 'INFO'
+load_from = None
+resume = False
diff --git a/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py
new file mode 100644
index 0000000000..2c8f76311d
--- /dev/null
+++ b/projects/stad_tutorial/configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'
+model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))
+
+# take 2 epochs as an example
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)
+
+# learning rate
+param_scheduler = [
+ dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)
+]
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))
+
+dataset_type = 'CocoDataset'
+# modify metainfo
+metainfo = {
+ 'classes': ('person', ),
+ 'palette': [
+ (220, 20, 60),
+ ]
+}
+
+# specify metainfo, dataset path
+data_root = 'data/multisports/'
+
+train_dataloader = dict(
+ dataset=dict(
+ data_root=data_root,
+ ann_file='annotations/multisports_det_anno_train.json',
+ data_prefix=dict(img='rawframes/'),
+ metainfo=metainfo))
+
+val_dataloader = dict(
+ dataset=dict(
+ data_root=data_root,
+ ann_file='annotations/multisports_det_anno_val.json',
+ data_prefix=dict(img='rawframes/'),
+ metainfo=metainfo))
+
+test_dataloader = dict(
+ dataset=dict(
+ data_root=data_root,
+ ann_file='annotations/ms_infer_anno.json',
+ data_prefix=dict(img='rawframes/'),
+ metainfo=metainfo))
+
+# specify annotation file path, modify metric items
+val_evaluator = dict(
+ ann_file='data/multisports/annotations/multisports_det_anno_val.json',
+ metric_items=['mAP_50', 'AR@100'],
+ iou_thrs=[0.5],
+)
+
+test_evaluator = dict(
+ ann_file='data/multisports/annotations/ms_infer_anno.json',
+ metric_items=['mAP_50', 'AR@100'],
+ iou_thrs=[0.5],
+)
+
+# specify pretrain checkpoint
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501
diff --git a/projects/stad_tutorial/configs/slowonly_k400_multisports.py b/projects/stad_tutorial/configs/slowonly_k400_multisports.py
new file mode 100644
index 0000000000..d8eb4176df
--- /dev/null
+++ b/projects/stad_tutorial/configs/slowonly_k400_multisports.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+_base_ = [
+ 'mmaction::detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py' # noqa: E501
+]
+
+proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl' # noqa: E501
+proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl' # noqa: E501
+
+train_dataloader = dict(
+ batch_size=2,
+ num_workers=2,
+ dataset=dict(proposal_file=proposal_file_train))
+
+val_dataloader = dict(
+ num_workers=2, dataset=dict(proposal_file=proposal_file_val))
+
+optim_wrapper = dict(optimizer=dict(type='SGD', lr=0.01))
+
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth' # noqa: E501
diff --git a/projects/stad_tutorial/demo_stad.ipynb b/projects/stad_tutorial/demo_stad.ipynb
new file mode 100644
index 0000000000..4fc14532e5
--- /dev/null
+++ b/projects/stad_tutorial/demo_stad.ipynb
@@ -0,0 +1,4096 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MxFBtHQ4ooZh"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ff6iCPqqooZp"
+ },
+ "source": [
+ "# Spatio-temporal action detection with MMAction2\n",
+ "Welcome to MMAction2! This is a tutorial on how to use MMAction2 for spatio-temporal action detection. In this tutorial, we will use the MultiSports dataset as an example, and provide a complete step-by-step guide for spatio-temporal action detection, including\n",
+ "- Prepare spatio-temporal action detection dataset\n",
+ "- Train detection model\n",
+ "- Prepare AVA format dataset\n",
+ "- Train spatio-temporal action detection model\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xQlffdn7ooZq"
+ },
+ "source": [
+ "## 0. Install MMAction2 and MMDetection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4vWjBJI-ooZr",
+ "outputId": "1c852c24-eb40-407d-e1c4-72d4b43385a3"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting openmim\n",
+ " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.3/51.3 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from openmim) (8.1.3)\n",
+ "Collecting colorama (from openmim)\n",
+ " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
+ "Collecting model-index (from openmim)\n",
+ " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n",
+ "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n",
+ "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n",
+ "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n",
+ "Collecting ordered-set (from model-index->openmim)\n",
+ " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n",
+ "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n",
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n",
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n",
+ "Installing collected packages: ordered-set, colorama, model-index, openmim\n",
+ "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n",
+ "Collecting mmengine\n",
+ " Downloading mmengine-0.7.4-py3-none-any.whl (374 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m374.3/374.3 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting addict (from mmengine)\n",
+ " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n",
+ "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n",
+ "Collecting yapf (from mmengine)\n",
+ " Downloading yapf-0.40.0-py3-none-any.whl (250 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.3/250.3 kB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n",
+ "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (8.4.0)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n",
+ "Collecting importlib-metadata>=6.6.0 (from yapf->mmengine)\n",
+ " Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)\n",
+ "Collecting platformdirs>=3.5.1 (from yapf->mmengine)\n",
+ " Downloading platformdirs-3.5.3-py3-none-any.whl (15 kB)\n",
+ "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmengine) (3.15.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n",
+ "Installing collected packages: addict, platformdirs, importlib-metadata, yapf, mmengine\n",
+ " Attempting uninstall: platformdirs\n",
+ " Found existing installation: platformdirs 3.3.0\n",
+ " Uninstalling platformdirs-3.3.0:\n",
+ " Successfully uninstalled platformdirs-3.3.0\n",
+ "Successfully installed addict-2.4.0 importlib-metadata-6.6.0 mmengine-0.7.4 platformdirs-3.5.3 yapf-0.40.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n",
+ "Collecting mmcv\n",
+ " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.4/74.4 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv) (2.4.0)\n",
+ "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.7.4)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv) (1.22.4)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv) (23.1)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv) (8.4.0)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv) (6.0)\n",
+ "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.40.0)\n",
+ "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv) (4.7.0.72)\n",
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (3.7.1)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (13.3.4)\n",
+ "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (2.3.0)\n",
+ "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (6.6.0)\n",
+ "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (3.5.3)\n",
+ "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (2.0.1)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv) (3.15.0)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.4.4)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (2.8.2)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.14.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv) (0.1.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv) (1.16.0)\n",
+ "Installing collected packages: mmcv\n",
+ "Successfully installed mmcv-2.0.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n",
+ "Collecting mmdet\n",
+ " Downloading mmdet-3.0.0-py3-none-any.whl (1.7 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmdet) (3.7.1)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.22.4)\n",
+ "Requirement already satisfied: pycocotools in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.6)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.10.1)\n",
+ "Requirement already satisfied: shapely in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.1)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.16.0)\n",
+ "Collecting terminaltables (from mmdet)\n",
+ " Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)\n",
+ "Requirement already satisfied: mmcv<2.1.0,>=2.0.0rc4 in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.0)\n",
+ "Requirement already satisfied: mmengine<1.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from mmdet) (0.7.4)\n",
+ "Requirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.4.0)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (23.1)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (8.4.0)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.0)\n",
+ "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (0.40.0)\n",
+ "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (4.7.0.72)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (13.3.4)\n",
+ "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (2.3.0)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.4.4)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (2.8.2)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.14.0)\n",
+ "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.6.0)\n",
+ "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.5.3)\n",
+ "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.0.1)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.15.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine<1.0.0,>=0.7.1->mmdet) (0.1.2)\n",
+ "Installing collected packages: terminaltables, mmdet\n",
+ "Successfully installed mmdet-3.0.0 terminaltables-3.1.10\n",
+ "Cloning into 'mmaction2'...\n",
+ "remote: Enumerating objects: 22869, done.\u001b[K\n",
+ "remote: Counting objects: 100% (1491/1491), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (800/800), done.\u001b[K\n",
+ "remote: Total 22869 (delta 855), reused 1176 (delta 686), pack-reused 21378\u001b[K\n",
+ "Receiving objects: 100% (22869/22869), 82.81 MiB | 15.42 MiB/s, done.\n",
+ "Resolving deltas: 100% (15954/15954), done.\n",
+ "/content/mmaction2\n",
+ "Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Obtaining file:///content/mmaction2\n",
+ " Running command python setup.py egg_info\n",
+ " running egg_info\n",
+ " creating /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info\n",
+ " writing /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/PKG-INFO\n",
+ " writing dependency_links to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/dependency_links.txt\n",
+ " writing requirements to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/requires.txt\n",
+ " writing top-level names to /tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/top_level.txt\n",
+ " writing manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest template 'MANIFEST.in'\n",
+ " warning: no files found matching 'mmaction/.mim/model-index.yml'\n",
+ " warning: no files found matching '*.py' under directory 'mmaction/.mim/configs'\n",
+ " warning: no files found matching '*.yml' under directory 'mmaction/.mim/configs'\n",
+ " warning: no files found matching '*.sh' under directory 'mmaction/.mim/tools'\n",
+ " warning: no files found matching '*.py' under directory 'mmaction/.mim/tools'\n",
+ " adding license file 'LICENSE'\n",
+ " writing manifest file '/tmp/pip-pip-egg-info-d1y_zlo6/mmaction2.egg-info/SOURCES.txt'\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n",
+ " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m71.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting einops (from mmaction2==1.0.0)\n",
+ " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n",
+ "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (8.4.0)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n",
+ "Requirement already satisfied: torch>=1.3 in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.1+cu118)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n",
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n",
+ "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n",
+ "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n",
+ "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.5)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n",
+ "Installing collected packages: einops, decord, mmaction2\n",
+ " Running setup.py develop for mmaction2\n",
+ " Running command python setup.py develop\n",
+ " running develop\n",
+ " /usr/local/lib/python3.10/dist-packages/setuptools/command/develop.py:40: EasyInstallDeprecationWarning: easy_install command is deprecated.\n",
+ " !!\n",
+ "\n",
+ " ********************************************************************************\n",
+ " Please avoid running ``setup.py`` and ``easy_install``.\n",
+ " Instead, use pypa/build, pypa/installer, pypa/build or\n",
+ " other standards-based tools.\n",
+ "\n",
+ " See https://github.com/pypa/setuptools/issues/917 for details.\n",
+ " ********************************************************************************\n",
+ "\n",
+ " !!\n",
+ " easy_install.initialize_options(self)\n",
+ " /usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.\n",
+ " !!\n",
+ "\n",
+ " ********************************************************************************\n",
+ " Please avoid running ``setup.py`` directly.\n",
+ " Instead, use pypa/build, pypa/installer, pypa/build or\n",
+ " other standards-based tools.\n",
+ "\n",
+ " See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.\n",
+ " ********************************************************************************\n",
+ "\n",
+ " !!\n",
+ " self.initialize_options()\n",
+ " running egg_info\n",
+ " creating mmaction2.egg-info\n",
+ " writing mmaction2.egg-info/PKG-INFO\n",
+ " writing dependency_links to mmaction2.egg-info/dependency_links.txt\n",
+ " writing requirements to mmaction2.egg-info/requires.txt\n",
+ " writing top-level names to mmaction2.egg-info/top_level.txt\n",
+ " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest file 'mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest template 'MANIFEST.in'\n",
+ " adding license file 'LICENSE'\n",
+ " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n",
+ " running build_ext\n",
+ " Creating /usr/local/lib/python3.10/dist-packages/mmaction2.egg-link (link to .)\n",
+ " Adding mmaction2 1.0.0 to easy-install.pth file\n",
+ "\n",
+ " Installed /content/mmaction2\n",
+ "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n",
+ "/content/mmaction2/projects/stad_tutorial\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -U openmim\n",
+ "!mim install mmengine\n",
+ "!mim install mmcv\n",
+ "!mim install mmdet\n",
+ "\n",
+ "!git clone https://github.com/open-mmlab/mmaction2.git\n",
+ "\n",
+ "%cd mmaction2\n",
+ "%pip install -v -e .\n",
+ "%cd projects/stad_tutorial"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Ox0TM64FooZt"
+ },
+ "source": [
+ "## 1. Prepare spatio-temporal action detection dataset\n",
+ "\n",
+ "Similar to detection tasks that require bounding box annotations, spatio-temporal action detection tasks require temporal and spatial localization, so more complex tube annotations are required. Taking the MultiSports dataset as an example, the `gttubes` field provides all the target action annotations in the video, and the following is an annotation fragment:\n",
+ "\n",
+ "```\n",
+ " 'gttubes': {\n",
+ " 'aerobic_gymnastics/v_aqMgwPExjD0_c001': # video_key\n",
+ " {\n",
+ " 10: # label index\n",
+ " [\n",
+ " array([[ 377., 904., 316., 1016., 584.], # 1st tube of class 10\n",
+ " [ 378., 882., 315., 1016., 579.], # shape (n, 5): n frames,each annotation includes (frame idx,x1,y1, x2, y2)\n",
+ " ...\n",
+ " [ 398., 861., 304., 954., 549.]], dtype=float32),\n",
+ "\n",
+ " array([[ 399., 881., 308., 955., 542.], # 2nd tube of class 10\n",
+ " [ 400., 862., 303., 988., 539.],\n",
+ " [ 401., 853., 292., 1000., 535.],\n",
+ " ...])\n",
+ " ...\n",
+ "\n",
+ " ] ,\n",
+ " 9: # label index\n",
+ " [\n",
+ " array(...), # 1st tube of class 9\n",
+ " array(...), # 2nd tube of class 9\n",
+ " ...\n",
+ " ]\n",
+ " ...\n",
+ " }\n",
+ " }\n",
+ "```\n",
+ "\n",
+ "The annotation file also needs to provide other field information, and the complete ground truth file includes the following information:\n",
+ "\n",
+ "```\n",
+ "{\n",
+ " 'labels': # label list\n",
+ " ['aerobic push up', 'aerobic explosive push up', ...],\n",
+ " 'train_videos': # training video list\n",
+ " [\n",
+ " [\n",
+ " 'aerobic_gymnastics/v_aqMgwPExjD0_c001',\n",
+ " 'aerobic_gymnastics/v_yaKOumdXwbU_c019',\n",
+ " ...\n",
+ " ]\n",
+ " ]\n",
+ " 'test_videos': # test video list\n",
+ " [\n",
+ " [\n",
+ " 'aerobic_gymnastics/v_crsi07chcV8_c004',\n",
+ " 'aerobic_gymnastics/v_dFYr67eNMwA_c005',\n",
+ " ...\n",
+ " ]\n",
+ " ]\n",
+ " 'n_frames': # dict provides frame number of each video\n",
+ " {\n",
+ " 'aerobic_gymnastics/v_crsi07chcV8_c004': 725,\n",
+ " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': 750,\n",
+ " ...\n",
+ " }\n",
+ " 'resolution': # dict provides resolution of each video\n",
+ " {\n",
+ " 'aerobic_gymnastics/v_crsi07chcV8_c004': (720, 1280),\n",
+ " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': (720, 1280),\n",
+ " ...\n",
+ " }\n",
+ " 'gt_tubes': # dict provides bouding boxes of each tube\n",
+ " {\n",
+ " ... # refer to above description\n",
+ " }\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "The subsequent experiments are based on MultiSports-tiny, we extracted a small number of videos from MultiSports for demonstration purposes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "n5AzsRvdooZv",
+ "outputId": "a6cad83b-4613-43cc-8c09-86ac79242656"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2023-06-15 06:00:15-- https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n",
+ "Resolving download.openmmlab.com (download.openmmlab.com)... 163.181.82.215, 163.181.82.216, 163.181.82.218, ...\n",
+ "Connecting to download.openmmlab.com (download.openmmlab.com)|163.181.82.215|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 82780160 (79M) [application/x-tar]\n",
+ "Saving to: ‘data/multisports-tiny.tar’\n",
+ "\n",
+ "multisports-tiny.ta 100%[===================>] 78.95M 13.3MB/s in 44s \n",
+ "\n",
+ "2023-06-15 06:01:00 (1.78 MB/s) - ‘data/multisports-tiny.tar’ saved [82780160/82780160]\n",
+ "\n",
+ "multisports-tiny/multisports/\n",
+ "multisports-tiny/multisports/test/\n",
+ "multisports-tiny/multisports/test/aerobic_gymnastics/\n",
+ "multisports-tiny/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4\n",
+ "multisports-tiny/multisports/annotations/\n",
+ "multisports-tiny/multisports/annotations/multisports_GT.pkl\n",
+ "multisports-tiny/multisports/trainval/\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c001.mp4\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c003.mp4\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c002.mp4\n",
+ "Reading package lists...\n",
+ "Building dependency tree...\n",
+ "Reading state information...\n",
+ "The following NEW packages will be installed:\n",
+ " tree\n",
+ "0 upgraded, 1 newly installed, 0 to remove and 46 not upgraded.\n",
+ "Need to get 43.0 kB of archives.\n",
+ "After this operation, 115 kB of additional disk space will be used.\n",
+ "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n",
+ "Fetched 43.0 kB in 1s (43.0 kB/s)\n",
+ "Selecting previously unselected package tree.\n",
+ "(Reading database ... 122541 files and directories currently installed.)\n",
+ "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n",
+ "Unpacking tree (1.8.0-1) ...\n",
+ "Setting up tree (1.8.0-1) ...\n",
+ "Processing triggers for man-db (2.9.1-1) ...\n",
+ "\u001b[01;34mdata\u001b[00m\n",
+ "├── \u001b[01;34mmultisports\u001b[00m\n",
+ "│ ├── \u001b[01;34mannotations\u001b[00m\n",
+ "│ │ └── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n",
+ "│ ├── \u001b[01;34mtest\u001b[00m\n",
+ "│ │ └── \u001b[01;34maerobic_gymnastics\u001b[00m\n",
+ "│ │ └── \u001b[01;32mv_7G_IpU0FxLU_c001.mp4\u001b[00m\n",
+ "│ └── \u001b[01;34mtrainval\u001b[00m\n",
+ "│ └── \u001b[01;34maerobic_gymnastics\u001b[00m\n",
+ "│ ├── \u001b[01;32mv__wAgwttPYaQ_c001.mp4\u001b[00m\n",
+ "│ ├── \u001b[01;32mv__wAgwttPYaQ_c002.mp4\u001b[00m\n",
+ "│ └── \u001b[01;32mv__wAgwttPYaQ_c003.mp4\u001b[00m\n",
+ "└── \u001b[01;31mmultisports-tiny.tar\u001b[00m\n",
+ "\n",
+ "6 directories, 6 files\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Download dataset\n",
+ "!wget -P data -c https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n",
+ "!tar -xvf data/multisports-tiny.tar --strip 1 -C data\n",
+ "!apt-get -q install tree\n",
+ "!tree data"
+ ]
+ },
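+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With the tiny dataset in place, a quick way to sanity-check the annotation structure described above is to load the pickle directly. The following is a minimal sketch, assuming the keys shown in the annotation fragment (`labels`, `train_videos`, `gttubes`):\n",
+    "\n",
+    "```python\n",
+    "import pickle\n",
+    "\n",
+    "# Load the MultiSports ground-truth annotations downloaded above.\n",
+    "with open('data/multisports/annotations/multisports_GT.pkl', 'rb') as f:\n",
+    "    gt = pickle.load(f)\n",
+    "\n",
+    "print(gt.keys())         # top-level fields of the annotation file\n",
+    "print(gt['labels'][:5])  # first few action classes\n",
+    "\n",
+    "# Inspect the tubes of the first training video.\n",
+    "video_key = gt['train_videos'][0][0]\n",
+    "for label_idx, tubes in gt['gttubes'][video_key].items():\n",
+    "    # Each tube is an (n, 5) array of (frame_idx, x1, y1, x2, y2) rows.\n",
+    "    print(label_idx, [tube.shape for tube in tubes])\n",
+    "```"
+   ]
+  },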
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_u69LHscooZw"
+ },
+ "source": [
+ "## 2. Train detection model\n",
+ "\n",
+ "In the SlowOnly + Det paradigm, we need to train a human detector first, and then predict actions based on the detection results. In this section, we train a detection model based on the annotation format in the previous section and the MMDetection algorithm library.\n",
+ "\n",
+ "### 2.1 Build detection dataset annotation (COCO format)\n",
+ "\n",
+ "Based on the annotation information of the spatio-temporal action detection dataset, we can build a COCO format detection dataset for training the detection model. We provide a script to convert the MultiSports format annotation, if you need to convert from other formats, you can refer to the [custom dataset](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/customize_dataset.html) document provided by MMDetection."
+ ]
+ },
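+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the conversion target is the standard COCO detection layout with `images`, `annotations` and `categories` lists. The sketch below is illustrative only (the frame file name is hypothetical and the exact fields are determined by `tools/generate_mmdet_anno.py`); the box corresponds to the frame-378 row of the tube fragment in Section 1, converted from `(x1, y1, x2, y2)` to COCO's `(x, y, w, h)`:\n",
+    "\n",
+    "```python\n",
+    "coco_anno = {\n",
+    "    'categories': [{'id': 1, 'name': 'person'}],  # single class: person\n",
+    "    'images': [\n",
+    "        # one entry per extracted RGB frame\n",
+    "        {'id': 0,\n",
+    "         'file_name': 'aerobic_gymnastics/v_aqMgwPExjD0_c001/00378.jpg',  # hypothetical name\n",
+    "         'height': 720, 'width': 1280},\n",
+    "    ],\n",
+    "    'annotations': [\n",
+    "        # one entry per person box in a frame, bbox given as (x, y, w, h)\n",
+    "        {'id': 0, 'image_id': 0, 'category_id': 1,\n",
+    "         'bbox': [882.0, 315.0, 134.0, 264.0],  # from (x1, y1, x2, y2) = (882, 315, 1016, 579)\n",
+    "         'area': 134.0 * 264.0, 'iscrowd': 0},\n",
+    "    ],\n",
+    "}\n",
+    "```"
+   ]
+  },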
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "e8fu9VtRooZw",
+ "outputId": "3e7a7053-a08d-4c32-9d66-a362b3de164d"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[01;34mdata/multisports/annotations\u001b[00m\n",
+ "├── multisports_det_anno_train.json\n",
+ "├── multisports_det_anno_val.json\n",
+ "└── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n",
+ "\n",
+ "0 directories, 3 files\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python tools/generate_mmdet_anno.py data/multisports/annotations/multisports_GT.pkl data/multisports/annotations/multisports_det_anno.json\n",
+ "!tree data/multisports/annotations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HJAb8EwwooZx",
+ "outputId": "1c82387c-c731-484c-a4cc-8c255b3f2e62"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Will generate 3 rgb dir for aerobic_gymnastics.\n",
+ "Generate v__wAgwttPYaQ_c003 rgb dir successfully.\n",
+ "Generate v__wAgwttPYaQ_c002 rgb dir successfully.\n",
+ "Generate v__wAgwttPYaQ_c001 rgb dir successfully.\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python tools/generate_rgb.py"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9xIOk_XkooZx"
+ },
+ "source": [
+ "### 2.2 Modify config file\n",
+ "\n",
+ "We use faster-rcnn_x101-64x4d_fpn_1x_coco as the base configuration, and make the following modifications to train on the MultiSports dataset. The following parts need to be modified:\n",
+ "- Number of model categories\n",
+ "- Learning rate adjustment strategy\n",
+ "- Optimizer configuration\n",
+ "- Dataset/annotation file path\n",
+ "- Evaluator configuration\n",
+ "- Pre-trained model\n",
+ "\n",
+ "For more detailed tutorials, please refer to the [prepare configuration file](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#id9) document provided by MMDetection."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Ad1QLNM8ooZy",
+ "outputId": "55f95e91-8fdf-40fa-dd08-5fa980444b6f"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "# Copyright (c) OpenMMLab. All rights reserved.\n",
+ "_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'\n",
+ "model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))\n",
+ "\n",
+ "# take 2 epochs as an example\n",
+ "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n",
+ "\n",
+ "# learning rate\n",
+ "param_scheduler = [\n",
+ " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n",
+ "]\n",
+ "\n",
+ "# optimizer\n",
+ "optim_wrapper = dict(\n",
+ " type='OptimWrapper',\n",
+ " optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))\n",
+ "\n",
+ "dataset_type = 'CocoDataset'\n",
+ "# modify metainfo\n",
+ "metainfo = {\n",
+ " 'classes': ('person', ),\n",
+ " 'palette': [\n",
+ " (220, 20, 60),\n",
+ " ]\n",
+ "}\n",
+ "\n",
+ "# specify metainfo, dataset path\n",
+ "data_root = 'data/multisports/'\n",
+ "\n",
+ "train_dataloader = dict(\n",
+ " dataset=dict(\n",
+ " data_root=data_root,\n",
+ " ann_file='annotations/multisports_det_anno_train.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " metainfo=metainfo))\n",
+ "\n",
+ "val_dataloader = dict(\n",
+ " dataset=dict(\n",
+ " data_root=data_root,\n",
+ " ann_file='annotations/multisports_det_anno_val.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " metainfo=metainfo))\n",
+ "\n",
+ "test_dataloader = dict(\n",
+ " dataset=dict(\n",
+ " data_root=data_root,\n",
+ " ann_file='annotations/ms_infer_anno.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " metainfo=metainfo))\n",
+ "\n",
+ "# specify annotaition file path, modify metric items\n",
+ "val_evaluator = dict(\n",
+ " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5],\n",
+ ")\n",
+ "\n",
+ "test_evaluator = dict(\n",
+ " ann_file='data/multisports/annotations/ms_infer_anno.json',\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5],\n",
+ ")\n",
+ "\n",
+ "# specify pretrain checkpoint\n",
+ "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501\n"
+ ]
+ }
+ ],
+ "source": [
+ "!cat configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "W40JO80nooZ0"
+ },
+ "source": [
+ "### 2.3 Train detection model"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Oc1LWr4AooZ0"
+ },
+ "source": [
+ "By using MIM, you can directly train MMDetection models in the current directory. Here is the simplest example of training on a single GPU. For more training commands, please refer to the MIM [tutorial](https://github.com/open-mmlab/mim#command)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "QpxCbvr2ooZ0",
+ "outputId": "ffe7b420-c359-4e5a-a1b1-3a75e923046d"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/train.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py --launcher none --work-dir work_dirs/det_model. \n",
+ "06/15 06:02:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 503128501\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
+ " - LAPACK is enabled (usually provided by MKL)\n",
+ " - NNPACK is enabled\n",
+ " - CPU capability usage: AVX2\n",
+ " - CUDA Runtime 11.8\n",
+ " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n",
+ " - CuDNN 8.7\n",
+ " - Magma 2.6.1\n",
+ " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n",
+ "\n",
+ " TorchVision: 0.15.2+cu118\n",
+ " OpenCV: 4.7.0\n",
+ " MMEngine: 0.7.4\n",
+ "\n",
+ "Runtime environment:\n",
+ " cudnn_benchmark: False\n",
+ " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n",
+ " dist_cfg: {'backend': 'nccl'}\n",
+ " seed: 503128501\n",
+ " Distributed launcher: none\n",
+ " Distributed training: False\n",
+ " GPU number: 1\n",
+ "------------------------------------------------------------\n",
+ "\n",
+ "06/15 06:02:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n",
+ "model = dict(\n",
+ " type='FasterRCNN',\n",
+ " data_preprocessor=dict(\n",
+ " type='DetDataPreprocessor',\n",
+ " mean=[103.53, 116.28, 123.675],\n",
+ " std=[1.0, 1.0, 1.0],\n",
+ " bgr_to_rgb=False,\n",
+ " pad_size_divisor=32),\n",
+ " backbone=dict(\n",
+ " type='ResNet',\n",
+ " depth=50,\n",
+ " num_stages=4,\n",
+ " out_indices=(0, 1, 2, 3),\n",
+ " frozen_stages=1,\n",
+ " norm_cfg=dict(type='BN', requires_grad=False),\n",
+ " norm_eval=True,\n",
+ " style='caffe',\n",
+ " init_cfg=dict(\n",
+ " type='Pretrained',\n",
+ " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n",
+ " neck=dict(\n",
+ " type='FPN',\n",
+ " in_channels=[256, 512, 1024, 2048],\n",
+ " out_channels=256,\n",
+ " num_outs=5),\n",
+ " rpn_head=dict(\n",
+ " type='RPNHead',\n",
+ " in_channels=256,\n",
+ " feat_channels=256,\n",
+ " anchor_generator=dict(\n",
+ " type='AnchorGenerator',\n",
+ " scales=[8],\n",
+ " ratios=[0.5, 1.0, 2.0],\n",
+ " strides=[4, 8, 16, 32, 64]),\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[1.0, 1.0, 1.0, 1.0]),\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n",
+ " roi_head=dict(\n",
+ " type='StandardRoIHead',\n",
+ " bbox_roi_extractor=dict(\n",
+ " type='SingleRoIExtractor',\n",
+ " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n",
+ " out_channels=256,\n",
+ " featmap_strides=[4, 8, 16, 32]),\n",
+ " bbox_head=dict(\n",
+ " type='Shared2FCBBoxHead',\n",
+ " in_channels=256,\n",
+ " fc_out_channels=1024,\n",
+ " roi_feat_size=7,\n",
+ " num_classes=1,\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[0.1, 0.1, 0.2, 0.2]),\n",
+ " reg_class_agnostic=False,\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n",
+ " train_cfg=dict(\n",
+ " rpn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.7,\n",
+ " neg_iou_thr=0.3,\n",
+ " min_pos_iou=0.3,\n",
+ " match_low_quality=True,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=256,\n",
+ " pos_fraction=0.5,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=False),\n",
+ " allowed_border=-1,\n",
+ " pos_weight=-1,\n",
+ " debug=False),\n",
+ " rpn_proposal=dict(\n",
+ " nms_pre=2000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.5,\n",
+ " neg_iou_thr=0.5,\n",
+ " min_pos_iou=0.5,\n",
+ " match_low_quality=False,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=512,\n",
+ " pos_fraction=0.25,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=True),\n",
+ " pos_weight=-1,\n",
+ " debug=False)),\n",
+ " test_cfg=dict(\n",
+ " rpn=dict(\n",
+ " nms_pre=1000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " score_thr=0.05,\n",
+ " nms=dict(type='nms', iou_threshold=0.5),\n",
+ " max_per_img=100)))\n",
+ "dataset_type = 'CocoDataset'\n",
+ "data_root = 'data/multisports/'\n",
+ "backend_args = None\n",
+ "train_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ "]\n",
+ "test_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ "]\n",
+ "train_dataloader = dict(\n",
+ " batch_size=2,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=True),\n",
+ " batch_sampler=dict(type='AspectRatioBatchSampler'),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_train.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_val.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "test_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/ms_infer_anno.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "test_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/ms_infer_anno.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n",
+ "val_cfg = dict(type='ValLoop')\n",
+ "test_cfg = dict(type='TestLoop')\n",
+ "param_scheduler = [\n",
+ " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n",
+ "]\n",
+ "optim_wrapper = dict(\n",
+ " type='OptimWrapper',\n",
+ " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n",
+ "auto_scale_lr = dict(enable=False, base_batch_size=16)\n",
+ "default_scope = 'mmdet'\n",
+ "default_hooks = dict(\n",
+ " timer=dict(type='IterTimerHook'),\n",
+ " logger=dict(type='LoggerHook', interval=50),\n",
+ " param_scheduler=dict(type='ParamSchedulerHook'),\n",
+ " checkpoint=dict(type='CheckpointHook', interval=1),\n",
+ " sampler_seed=dict(type='DistSamplerSeedHook'),\n",
+ " visualization=dict(type='DetVisualizationHook'))\n",
+ "env_cfg = dict(\n",
+ " cudnn_benchmark=False,\n",
+ " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n",
+ " dist_cfg=dict(backend='nccl'))\n",
+ "vis_backends = [dict(type='LocalVisBackend')]\n",
+ "visualizer = dict(\n",
+ " type='DetLocalVisualizer',\n",
+ " vis_backends=[dict(type='LocalVisBackend')],\n",
+ " name='visualizer')\n",
+ "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n",
+ "log_level = 'INFO'\n",
+ "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'\n",
+ "resume = False\n",
+ "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n",
+ "launcher = 'none'\n",
+ "work_dir = 'work_dirs/det_model'\n",
+ "\n",
+ "06/15 06:02:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n",
+ "06/15 06:02:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n",
+ "before_run:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "before_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_train_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DistSamplerSeedHook \n",
+ " -------------------- \n",
+ "before_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_val_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_val_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train:\n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_test_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_run:\n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "loading annotations into memory...\n",
+ "Done (t=0.01s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "06/15 06:02:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: open-mmlab://detectron2/resnet50_caffe\n",
+ "06/15 06:02:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by openmmlab backend from path: open-mmlab://detectron2/resnet50_caffe\n",
+ "Downloading: \"https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth\" to /root/.cache/torch/hub/checkpoints/resnet50_msra-5891d200.pth\n",
+ "100% 89.9M/89.9M [00:02<00:00, 34.8MB/s]\n",
+ "06/15 06:02:21 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n",
+ "\n",
+ "unexpected key in source state_dict: conv1.bias\n",
+ "\n",
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n",
+ "Downloading: \"https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\" to /root/.cache/torch/hub/checkpoints/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n",
+ "100% 158M/158M [00:04<00:00, 37.4MB/s]\n",
+ "06/15 06:02:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n",
+ "06/15 06:02:26 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n",
+ "06/15 06:02:26 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n",
+ "06/15 06:02:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/det_model.\n",
+ "06/15 06:02:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 50/118] lr: 5.0000e-03 eta: 0:01:56 time: 0.6273 data_time: 0.0111 memory: 3414 loss: 0.5456 loss_rpn_cls: 0.0070 loss_rpn_bbox: 0.0167 loss_cls: 0.1887 acc: 93.2617 loss_bbox: 0.3332\n",
+ "06/15 06:03:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 5.0000e-03 eta: 0:01:16 time: 0.5041 data_time: 0.0078 memory: 3414 loss: 0.4017 loss_rpn_cls: 0.0027 loss_rpn_bbox: 0.0130 loss_cls: 0.1313 acc: 94.8242 loss_bbox: 0.2547\n",
+ "06/15 06:03:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_060208\n",
+ "06/15 06:03:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n",
+ "06/15 06:03:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 50/120] eta: 0:00:08 time: 0.1196 data_time: 0.0059 memory: 3414 \n",
+ "06/15 06:03:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1234 data_time: 0.0082 memory: 679 \n",
+ "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n",
+ "Loading and preparing results...\n",
+ "DONE (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "Running per image evaluation...\n",
+ "Evaluate annotation type *bbox*\n",
+ "DONE (t=0.05s).\n",
+ "Accumulating evaluation results...\n",
+ "DONE (t=0.01s).\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.872\n",
+ " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.709\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.886\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.964\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.964\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.964\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.963\n",
+ "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.872 -1.000 -1.000 -1.000 0.709 0.886\n",
+ "06/15 06:03:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9640 data_time: 0.0067 time: 0.1212\n",
+ "06/15 06:04:14 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 50/118] lr: 5.0000e-03 eta: 0:00:37 time: 0.5316 data_time: 0.0094 memory: 3414 loss: 0.3385 loss_rpn_cls: 0.0012 loss_rpn_bbox: 0.0111 loss_cls: 0.1119 acc: 95.4102 loss_bbox: 0.2143\n",
+ "06/15 06:04:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 5.0000e-03 eta: 0:00:09 time: 0.5152 data_time: 0.0078 memory: 3414 loss: 0.3152 loss_rpn_cls: 0.0017 loss_rpn_bbox: 0.0109 loss_cls: 0.1050 acc: 94.7266 loss_bbox: 0.1977\n",
+ "06/15 06:04:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_060208\n",
+ "06/15 06:04:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n",
+ "06/15 06:04:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 50/120] eta: 0:00:08 time: 0.1237 data_time: 0.0080 memory: 3414 \n",
+ "06/15 06:05:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1202 data_time: 0.0062 memory: 679 \n",
+ "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n",
+ "Loading and preparing results...\n",
+ "DONE (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "Running per image evaluation...\n",
+ "Evaluate annotation type *bbox*\n",
+ "DONE (t=0.04s).\n",
+ "Accumulating evaluation results...\n",
+ "DONE (t=0.01s).\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.907\n",
+ " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.762\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.910\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.960\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.960\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.960\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.960\n",
+ "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.907 -1.000 -1.000 -1.000 0.762 0.910\n",
+ "06/15 06:05:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9600 data_time: 0.0066 time: 0.1214\n",
+ "\u001b[32mTraining finished successfully. \u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "!mim train mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n",
+ " --work-dir work_dirs/det_model"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IxlO927KooZ1"
+ },
+ "source": [
+ "### 2.4 Generating Proposal BBoxes\n",
+ "\n",
+ "During the training of the spatiotemporal action detection model, we need to rely on proposals generated by the detection model, rather than annotated detection boxes. Therefore, we need to use a trained detection model to perform inference on the entire dataset and convert the resulting proposals into the required format for subsequent training.\n",
+ "\n",
+ "#### 2.4.1 Converting the Dataset to Coco Format\n",
+ "\n",
+ "We provide a script to convert the MultiSports dataset into an annotation format without ground truth, which is used for inference."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "e6C7D2DSooZ1",
+ "outputId": "878015d1-0fc7-4eb6-af77-4f61aefcf2b2"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[>>] 2350/2350, 2053.0 task/s, elapsed: 1s, ETA: 0s\n",
+ "save json file: data/multisports/rawframes/../annotations/ms_infer_anno.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "!echo 'person' > data/multisports/annotations/label_map.txt\n",
+ "!python tools/images2coco.py \\\n",
+ " data/multisports/rawframes \\\n",
+ " data/multisports/annotations/label_map.txt \\\n",
+ " ms_infer_anno.json"
+ ]
+ },
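+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick, optional sanity check, you can load the generated file and confirm it follows the expected COCO layout (standard `images` / `categories` keys; `annotations` should be empty since no ground truth is included):\n",
+ "\n",
+ "```python\n",
+ "import json\n",
+ "\n",
+ "# Quick sanity check of the file written by images2coco.py\n",
+ "# (path taken from the log of the previous cell).\n",
+ "with open('data/multisports/annotations/ms_infer_anno.json') as f:\n",
+ "    anno = json.load(f)\n",
+ "\n",
+ "print(len(anno['images']), 'images')    # one entry per raw frame\n",
+ "print(anno['categories'])               # should list only 'person'\n",
+ "print(len(anno.get('annotations', [])), 'annotations (expected: 0)')\n",
+ "```"
+ ]
+ },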
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fGL3t4MEooZ1"
+ },
+ "source": [
+ "#### 2.4.2 Inference for Generating Proposal Files\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gerYk6q6ooZ1"
+ },
+ "source": [
+ "The inference of MMDetection models is also based on MIM. For more testing commands, please refer to the MIM [tutorial](GitHub - open-mmlab/mim: MIM Installs OpenMMLab Packages).\n",
+ "\n",
+ "After the inference is completed, the results will be saved in 'data/multisports/ms_proposals.pkl'."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lutiaqzpooZ1",
+ "outputId": "b05db6e8-04de-4e1e-8d99-32f4c952d633"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Testing command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/test.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py work_dirs/det_model/epoch_2.pth --launcher none --out data/multisports/annotations/ms_det_proposals.pkl. \n",
+ "06/15 06:05:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 1289054678\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
+ " - LAPACK is enabled (usually provided by MKL)\n",
+ " - NNPACK is enabled\n",
+ " - CPU capability usage: AVX2\n",
+ " - CUDA Runtime 11.8\n",
+ " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n",
+ " - CuDNN 8.7\n",
+ " - Magma 2.6.1\n",
+ " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n",
+ "\n",
+ " TorchVision: 0.15.2+cu118\n",
+ " OpenCV: 4.7.0\n",
+ " MMEngine: 0.7.4\n",
+ "\n",
+ "Runtime environment:\n",
+ " cudnn_benchmark: False\n",
+ " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n",
+ " dist_cfg: {'backend': 'nccl'}\n",
+ " seed: 1289054678\n",
+ " Distributed launcher: none\n",
+ " Distributed training: False\n",
+ " GPU number: 1\n",
+ "------------------------------------------------------------\n",
+ "\n",
+ "06/15 06:05:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n",
+ "model = dict(\n",
+ " type='FasterRCNN',\n",
+ " data_preprocessor=dict(\n",
+ " type='DetDataPreprocessor',\n",
+ " mean=[103.53, 116.28, 123.675],\n",
+ " std=[1.0, 1.0, 1.0],\n",
+ " bgr_to_rgb=False,\n",
+ " pad_size_divisor=32),\n",
+ " backbone=dict(\n",
+ " type='ResNet',\n",
+ " depth=50,\n",
+ " num_stages=4,\n",
+ " out_indices=(0, 1, 2, 3),\n",
+ " frozen_stages=1,\n",
+ " norm_cfg=dict(type='BN', requires_grad=False),\n",
+ " norm_eval=True,\n",
+ " style='caffe',\n",
+ " init_cfg=dict(\n",
+ " type='Pretrained',\n",
+ " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n",
+ " neck=dict(\n",
+ " type='FPN',\n",
+ " in_channels=[256, 512, 1024, 2048],\n",
+ " out_channels=256,\n",
+ " num_outs=5),\n",
+ " rpn_head=dict(\n",
+ " type='RPNHead',\n",
+ " in_channels=256,\n",
+ " feat_channels=256,\n",
+ " anchor_generator=dict(\n",
+ " type='AnchorGenerator',\n",
+ " scales=[8],\n",
+ " ratios=[0.5, 1.0, 2.0],\n",
+ " strides=[4, 8, 16, 32, 64]),\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[1.0, 1.0, 1.0, 1.0]),\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n",
+ " roi_head=dict(\n",
+ " type='StandardRoIHead',\n",
+ " bbox_roi_extractor=dict(\n",
+ " type='SingleRoIExtractor',\n",
+ " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n",
+ " out_channels=256,\n",
+ " featmap_strides=[4, 8, 16, 32]),\n",
+ " bbox_head=dict(\n",
+ " type='Shared2FCBBoxHead',\n",
+ " in_channels=256,\n",
+ " fc_out_channels=1024,\n",
+ " roi_feat_size=7,\n",
+ " num_classes=1,\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[0.1, 0.1, 0.2, 0.2]),\n",
+ " reg_class_agnostic=False,\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n",
+ " train_cfg=dict(\n",
+ " rpn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.7,\n",
+ " neg_iou_thr=0.3,\n",
+ " min_pos_iou=0.3,\n",
+ " match_low_quality=True,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=256,\n",
+ " pos_fraction=0.5,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=False),\n",
+ " allowed_border=-1,\n",
+ " pos_weight=-1,\n",
+ " debug=False),\n",
+ " rpn_proposal=dict(\n",
+ " nms_pre=2000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.5,\n",
+ " neg_iou_thr=0.5,\n",
+ " min_pos_iou=0.5,\n",
+ " match_low_quality=False,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=512,\n",
+ " pos_fraction=0.25,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=True),\n",
+ " pos_weight=-1,\n",
+ " debug=False)),\n",
+ " test_cfg=dict(\n",
+ " rpn=dict(\n",
+ " nms_pre=1000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " score_thr=0.05,\n",
+ " nms=dict(type='nms', iou_threshold=0.5),\n",
+ " max_per_img=100)))\n",
+ "dataset_type = 'CocoDataset'\n",
+ "data_root = 'data/multisports/'\n",
+ "backend_args = None\n",
+ "train_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ "]\n",
+ "test_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ "]\n",
+ "train_dataloader = dict(\n",
+ " batch_size=2,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=True),\n",
+ " batch_sampler=dict(type='AspectRatioBatchSampler'),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_train.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_val.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "test_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/ms_infer_anno.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "test_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/ms_infer_anno.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n",
+ "val_cfg = dict(type='ValLoop')\n",
+ "test_cfg = dict(type='TestLoop')\n",
+ "param_scheduler = [\n",
+ " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n",
+ "]\n",
+ "optim_wrapper = dict(\n",
+ " type='OptimWrapper',\n",
+ " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n",
+ "auto_scale_lr = dict(enable=False, base_batch_size=16)\n",
+ "default_scope = 'mmdet'\n",
+ "default_hooks = dict(\n",
+ " timer=dict(type='IterTimerHook'),\n",
+ " logger=dict(type='LoggerHook', interval=50),\n",
+ " param_scheduler=dict(type='ParamSchedulerHook'),\n",
+ " checkpoint=dict(type='CheckpointHook', interval=1),\n",
+ " sampler_seed=dict(type='DistSamplerSeedHook'),\n",
+ " visualization=dict(type='DetVisualizationHook'))\n",
+ "env_cfg = dict(\n",
+ " cudnn_benchmark=False,\n",
+ " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n",
+ " dist_cfg=dict(backend='nccl'))\n",
+ "vis_backends = [dict(type='LocalVisBackend')]\n",
+ "visualizer = dict(\n",
+ " type='DetLocalVisualizer',\n",
+ " vis_backends=[dict(type='LocalVisBackend')],\n",
+ " name='visualizer')\n",
+ "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n",
+ "log_level = 'INFO'\n",
+ "load_from = 'work_dirs/det_model/epoch_2.pth'\n",
+ "resume = False\n",
+ "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n",
+ "launcher = 'none'\n",
+ "work_dir = './work_dirs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person'\n",
+ "\n",
+ "06/15 06:05:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n",
+ "06/15 06:05:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n",
+ "before_run:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "before_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_train_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DistSamplerSeedHook \n",
+ " -------------------- \n",
+ "before_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_val_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_val_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train:\n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_test_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_run:\n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "06/15 06:05:20 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The prefix is not set in metric class DumpDetResults.\n",
+ "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n",
+ "06/15 06:05:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from work_dirs/det_model/epoch_2.pth\n",
+ "06/15 06:05:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 50/2350] eta: 0:05:50 time: 0.1523 data_time: 0.0084 memory: 512 \n",
+ "06/15 06:05:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 100/2350] eta: 0:05:05 time: 0.1191 data_time: 0.0042 memory: 512 \n",
+ "06/15 06:05:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 150/2350] eta: 0:04:45 time: 0.1178 data_time: 0.0023 memory: 512 \n",
+ "06/15 06:05:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 200/2350] eta: 0:04:36 time: 0.1255 data_time: 0.0074 memory: 512 \n",
+ "06/15 06:05:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 250/2350] eta: 0:04:26 time: 0.1205 data_time: 0.0031 memory: 512 \n",
+ "06/15 06:05:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 300/2350] eta: 0:04:19 time: 0.1238 data_time: 0.0063 memory: 512 \n",
+ "06/15 06:06:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 350/2350] eta: 0:04:11 time: 0.1206 data_time: 0.0046 memory: 512 \n",
+ "06/15 06:06:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 400/2350] eta: 0:04:03 time: 0.1178 data_time: 0.0030 memory: 512 \n",
+ "06/15 06:06:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 450/2350] eta: 0:03:56 time: 0.1212 data_time: 0.0058 memory: 512 \n",
+ "06/15 06:06:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 500/2350] eta: 0:03:48 time: 0.1165 data_time: 0.0031 memory: 512 \n",
+ "06/15 06:06:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 550/2350] eta: 0:03:41 time: 0.1202 data_time: 0.0061 memory: 512 \n",
+ "06/15 06:06:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 600/2350] eta: 0:03:34 time: 0.1179 data_time: 0.0044 memory: 512 \n",
+ "06/15 06:06:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 650/2350] eta: 0:03:27 time: 0.1156 data_time: 0.0024 memory: 512 \n",
+ "06/15 06:06:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 700/2350] eta: 0:03:21 time: 0.1212 data_time: 0.0058 memory: 512 \n",
+ "06/15 06:06:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 750/2350] eta: 0:03:14 time: 0.1161 data_time: 0.0025 memory: 512 \n",
+ "06/15 06:06:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 800/2350] eta: 0:03:08 time: 0.1200 data_time: 0.0058 memory: 512 \n",
+ "06/15 06:07:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 850/2350] eta: 0:03:02 time: 0.1203 data_time: 0.0053 memory: 512 \n",
+ "06/15 06:07:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 900/2350] eta: 0:02:55 time: 0.1177 data_time: 0.0030 memory: 512 \n",
+ "06/15 06:07:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 950/2350] eta: 0:02:50 time: 0.1233 data_time: 0.0076 memory: 512 \n",
+ "06/15 06:07:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1000/2350] eta: 0:02:43 time: 0.1172 data_time: 0.0025 memory: 512 \n",
+ "06/15 06:07:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1050/2350] eta: 0:02:37 time: 0.1202 data_time: 0.0053 memory: 512 \n",
+ "06/15 06:07:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1100/2350] eta: 0:02:31 time: 0.1208 data_time: 0.0059 memory: 512 \n",
+ "06/15 06:07:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1150/2350] eta: 0:02:25 time: 0.1167 data_time: 0.0030 memory: 512 \n",
+ "06/15 06:07:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1200/2350] eta: 0:02:19 time: 0.1212 data_time: 0.0053 memory: 512 \n",
+ "06/15 06:07:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1250/2350] eta: 0:02:12 time: 0.1163 data_time: 0.0027 memory: 512 \n",
+ "06/15 06:07:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1300/2350] eta: 0:02:06 time: 0.1188 data_time: 0.0046 memory: 512 \n",
+ "06/15 06:08:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1350/2350] eta: 0:02:00 time: 0.1201 data_time: 0.0056 memory: 512 \n",
+ "06/15 06:08:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1400/2350] eta: 0:01:54 time: 0.1161 data_time: 0.0024 memory: 512 \n",
+ "06/15 06:08:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1450/2350] eta: 0:01:48 time: 0.1234 data_time: 0.0079 memory: 512 \n",
+ "06/15 06:08:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1500/2350] eta: 0:01:42 time: 0.1165 data_time: 0.0024 memory: 512 \n",
+ "06/15 06:08:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1550/2350] eta: 0:01:36 time: 0.1191 data_time: 0.0043 memory: 512 \n",
+ "06/15 06:08:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1600/2350] eta: 0:01:30 time: 0.1219 data_time: 0.0071 memory: 512 \n",
+ "06/15 06:08:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1650/2350] eta: 0:01:24 time: 0.1166 data_time: 0.0026 memory: 512 \n",
+ "06/15 06:08:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1700/2350] eta: 0:01:18 time: 0.1224 data_time: 0.0067 memory: 512 \n",
+ "06/15 06:08:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1750/2350] eta: 0:01:12 time: 0.1175 data_time: 0.0032 memory: 512 \n",
+ "06/15 06:08:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1800/2350] eta: 0:01:06 time: 0.1186 data_time: 0.0041 memory: 512 \n",
+ "06/15 06:09:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1850/2350] eta: 0:01:00 time: 0.1227 data_time: 0.0067 memory: 512 \n",
+ "06/15 06:09:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1900/2350] eta: 0:00:54 time: 0.1220 data_time: 0.0070 memory: 512 \n",
+ "06/15 06:09:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1950/2350] eta: 0:00:48 time: 0.1229 data_time: 0.0081 memory: 512 \n",
+ "06/15 06:09:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2000/2350] eta: 0:00:42 time: 0.1173 data_time: 0.0029 memory: 512 \n",
+ "06/15 06:09:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2050/2350] eta: 0:00:36 time: 0.1184 data_time: 0.0037 memory: 512 \n",
+ "06/15 06:09:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2100/2350] eta: 0:00:30 time: 0.1216 data_time: 0.0066 memory: 512 \n",
+ "06/15 06:09:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2150/2350] eta: 0:00:24 time: 0.1166 data_time: 0.0026 memory: 512 \n",
+ "06/15 06:09:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2200/2350] eta: 0:00:18 time: 0.1213 data_time: 0.0052 memory: 512 \n",
+ "06/15 06:09:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2250/2350] eta: 0:00:12 time: 0.1180 data_time: 0.0033 memory: 512 \n",
+ "06/15 06:09:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2300/2350] eta: 0:00:06 time: 0.1173 data_time: 0.0032 memory: 512 \n",
+ "06/15 06:10:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] eta: 0:00:00 time: 0.1203 data_time: 0.0048 memory: 512 \n",
+ "06/15 06:10:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n",
+ "Loading and preparing results...\n",
+ "DONE (t=0.01s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "Running per image evaluation...\n",
+ "Evaluate annotation type *bbox*\n",
+ "DONE (t=0.36s).\n",
+ "Accumulating evaluation results...\n",
+ "DONE (t=0.28s).\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n",
+ "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: -1.000 -1.000 -1.000 -1.000 -1.000 -1.000\n",
+ "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Results has been saved to data/multisports/annotations/ms_det_proposals.pkl.\n",
+ "06/15 06:10:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: -1.0000 data_time: 0.0047 time: 0.1202\n",
+ "\u001b[32mTesting finished successfully.\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "!mim test mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n",
+ " --checkpoint work_dirs/det_model/epoch_2.pth \\\n",
+ " --out data/multisports/annotations/ms_det_proposals.pkl"
+ ]
+ },
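+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As an optional check, you can peek at the dumped detection results. The sketch below assumes the list-of-dicts layout that recent MMDetection 3.x versions write with `--out`; the exact keys (e.g. `img_path`, `pred_instances`) may differ across versions:\n",
+ "\n",
+ "```python\n",
+ "import pickle\n",
+ "\n",
+ "# Load the raw detections dumped by `mim test`.\n",
+ "with open('data/multisports/annotations/ms_det_proposals.pkl', 'rb') as f:\n",
+ "    results = pickle.load(f)\n",
+ "\n",
+ "print(type(results), len(results))  # expected: a list with 2350 entries (one per frame)\n",
+ "sample = results[0]\n",
+ "print(sample.keys() if isinstance(sample, dict) else type(sample))\n",
+ "```"
+ ]
+ },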
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jzWhc7ClooZ1"
+ },
+ "source": [
+ "## 3. Training the Spatio-temporal Action Detection Model\n",
+ "The provided annotation files and the proposal files generated by MMDetection need to be converted to the required format for training the spatiotemporal action detection model. We have provided relevant script to generate the specified format."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "W3slJsWHooZ2",
+ "outputId": "42a4b7be-91f8-4443-b693-ab40b743a14f"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loading test result...\n",
+ "[>>] 2350/2350, 3799.7 task/s, elapsed: 1s, ETA: 0s\n",
+ "\u001b[01;34mdata/multisports/annotations\u001b[00m\n",
+ "├── label_map.txt\n",
+ "├── ms_det_proposals.pkl\n",
+ "├── ms_infer_anno.json\n",
+ "├── multisports_det_anno_train.json\n",
+ "├── multisports_det_anno_val.json\n",
+ "├── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n",
+ "├── multisports_proposals_train.pkl\n",
+ "├── multisports_proposals_val.pkl\n",
+ "├── multisports_train.csv\n",
+ "└── multisports_val.csv\n",
+ "\n",
+ "0 directories, 10 files\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Convert annotation files\n",
+ "!python ../../tools/data/multisports/parse_anno.py\n",
+ "\n",
+ "# Convert proposal files\n",
+ "!python tools/convert_proposals.py\n",
+ "\n",
+ "!tree data/multisports/annotations"
+ ]
+ },
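+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The sketch below shows one way to inspect the converted proposal file. It assumes the AVA-style layout that MMAction2's `AVADataset` consumes, i.e. a dict mapping a `'video_id,frame'` key to an `(N, 5)` array of `[x1, y1, x2, y2, score]`; the exact key format is defined by `tools/convert_proposals.py`:\n",
+ "\n",
+ "```python\n",
+ "import pickle\n",
+ "\n",
+ "with open('data/multisports/annotations/multisports_proposals_train.pkl', 'rb') as f:\n",
+ "    proposals = pickle.load(f)\n",
+ "\n",
+ "key = next(iter(proposals))\n",
+ "print(key)                   # e.g. '<video_id>,<frame_index>'\n",
+ "print(proposals[key].shape)  # e.g. (num_person_boxes, 5)\n",
+ "```"
+ ]
+ },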
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yRSSHmw0ooZ2"
+ },
+ "source": [
+ "### 3.2 Training the Spatio-temporal Action Detection Model\n",
+ "\n",
+ "MMAction2 already supports training on the MultiSports dataset. You just need to modify the path to the proposal file. For detailed configurations, please refer to the [config](configs/slowonly_k400_multisports.py) file. Since the training data is limited, the configuration uses a pre-trained model trained on the complete MultiSports dataset. When training with a custom dataset, you don't need to specify the `load_from` configuration."
+ ]
+ },
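+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The variable names below follow this tutorial's config; the paths are placeholders rather than files created by this tutorial:\n",
+ "\n",
+ "```python\n",
+ "# Point the dataloaders at your own proposal files.\n",
+ "proposal_file_train = 'data/my_dataset/annotations/my_proposals_train.pkl'\n",
+ "proposal_file_val = 'data/my_dataset/annotations/my_proposals_val.pkl'\n",
+ "\n",
+ "# Drop the MultiSports-pretrained checkpoint when training on new data.\n",
+ "load_from = None\n",
+ "```"
+ ]
+ },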
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vwaay7NvooZ2",
+ "outputId": "add60ddd-2a40-4356-b120-1e7940043778"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training command is /usr/bin/python3 /content/mmaction2/mmaction/.mim/tools/train.py configs/slowonly_k400_multisports.py --launcher none --work-dir work_dirs/stad_model/. \n",
+ "06/15 06:10:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 1735696538\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
+ " - LAPACK is enabled (usually provided by MKL)\n",
+ " - NNPACK is enabled\n",
+ " - CPU capability usage: AVX2\n",
+ " - CUDA Runtime 11.8\n",
+ " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n",
+ " - CuDNN 8.7\n",
+ " - Magma 2.6.1\n",
+ " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n",
+ "\n",
+ " TorchVision: 0.15.2+cu118\n",
+ " OpenCV: 4.7.0\n",
+ " MMEngine: 0.7.4\n",
+ "\n",
+ "Runtime environment:\n",
+ " cudnn_benchmark: False\n",
+ " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n",
+ " dist_cfg: {'backend': 'nccl'}\n",
+ " seed: 1735696538\n",
+ " diff_rank_seed: False\n",
+ " deterministic: False\n",
+ " Distributed launcher: none\n",
+ " Distributed training: False\n",
+ " GPU number: 1\n",
+ "------------------------------------------------------------\n",
+ "\n",
+ "06/15 06:10:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n",
+ "default_scope = 'mmaction'\n",
+ "default_hooks = dict(\n",
+ " runtime_info=dict(type='RuntimeInfoHook', _scope_='mmaction'),\n",
+ " timer=dict(type='IterTimerHook', _scope_='mmaction'),\n",
+ " logger=dict(\n",
+ " type='LoggerHook', interval=20, ignore_last=False, _scope_='mmaction'),\n",
+ " param_scheduler=dict(type='ParamSchedulerHook', _scope_='mmaction'),\n",
+ " checkpoint=dict(\n",
+ " type='CheckpointHook',\n",
+ " interval=1,\n",
+ " save_best='auto',\n",
+ " _scope_='mmaction'),\n",
+ " sampler_seed=dict(type='DistSamplerSeedHook', _scope_='mmaction'),\n",
+ " sync_buffers=dict(type='SyncBuffersHook', _scope_='mmaction'))\n",
+ "env_cfg = dict(\n",
+ " cudnn_benchmark=False,\n",
+ " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n",
+ " dist_cfg=dict(backend='nccl'))\n",
+ "log_processor = dict(\n",
+ " type='LogProcessor', window_size=20, by_epoch=True, _scope_='mmaction')\n",
+ "vis_backends = [dict(type='LocalVisBackend', _scope_='mmaction')]\n",
+ "visualizer = dict(\n",
+ " type='ActionVisualizer',\n",
+ " vis_backends=[dict(type='LocalVisBackend')],\n",
+ " _scope_='mmaction')\n",
+ "log_level = 'INFO'\n",
+ "load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth'\n",
+ "resume = False\n",
+ "url = 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n",
+ "num_classes = 66\n",
+ "model = dict(\n",
+ " type='FastRCNN',\n",
+ " _scope_='mmdet',\n",
+ " init_cfg=dict(\n",
+ " type='Pretrained',\n",
+ " checkpoint=\n",
+ " 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n",
+ " ),\n",
+ " backbone=dict(\n",
+ " type='mmaction.ResNet3dSlowOnly',\n",
+ " depth=50,\n",
+ " pretrained=None,\n",
+ " pretrained2d=False,\n",
+ " lateral=False,\n",
+ " num_stages=4,\n",
+ " conv1_kernel=(1, 7, 7),\n",
+ " conv1_stride_t=1,\n",
+ " pool1_stride_t=1,\n",
+ " spatial_strides=(1, 2, 2, 1)),\n",
+ " roi_head=dict(\n",
+ " type='AVARoIHead',\n",
+ " bbox_roi_extractor=dict(\n",
+ " type='SingleRoIExtractor3D',\n",
+ " roi_layer_type='RoIAlign',\n",
+ " output_size=8,\n",
+ " with_temporal_pool=True),\n",
+ " bbox_head=dict(\n",
+ " type='BBoxHeadAVA',\n",
+ " in_channels=2048,\n",
+ " num_classes=66,\n",
+ " multilabel=False,\n",
+ " dropout_ratio=0.5)),\n",
+ " data_preprocessor=dict(\n",
+ " type='mmaction.ActionDataPreprocessor',\n",
+ " mean=[123.675, 116.28, 103.53],\n",
+ " std=[58.395, 57.12, 57.375],\n",
+ " format_shape='NCTHW'),\n",
+ " train_cfg=dict(\n",
+ " rcnn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssignerAVA',\n",
+ " pos_iou_thr=0.9,\n",
+ " neg_iou_thr=0.9,\n",
+ " min_pos_iou=0.9),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=32,\n",
+ " pos_fraction=1,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=True),\n",
+ " pos_weight=1.0)),\n",
+ " test_cfg=dict(rcnn=None))\n",
+ "dataset_type = 'AVADataset'\n",
+ "data_root = 'data/multisports/trainval'\n",
+ "anno_root = 'data/multisports/annotations'\n",
+ "ann_file_train = 'data/multisports/annotations/multisports_train.csv'\n",
+ "ann_file_val = 'data/multisports/annotations/multisports_val.csv'\n",
+ "gt_file = 'data/multisports/annotations/multisports_GT.pkl'\n",
+ "proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl'\n",
+ "proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl'\n",
+ "file_client_args = dict(io_backend='disk')\n",
+ "train_pipeline = [\n",
+ " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='DecordDecode', _scope_='mmaction'),\n",
+ " dict(type='RandomRescale', scale_range=(256, 320), _scope_='mmaction'),\n",
+ " dict(type='RandomCrop', size=256, _scope_='mmaction'),\n",
+ " dict(type='Flip', flip_ratio=0.5, _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='FormatShape',\n",
+ " input_format='NCTHW',\n",
+ " collapse=True,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='PackActionInputs', _scope_='mmaction')\n",
+ "]\n",
+ "val_pipeline = [\n",
+ " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " test_mode=True,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='DecordDecode', _scope_='mmaction'),\n",
+ " dict(type='Resize', scale=(-1, 256), _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='FormatShape',\n",
+ " input_format='NCTHW',\n",
+ " collapse=True,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='PackActionInputs', _scope_='mmaction')\n",
+ "]\n",
+ "train_dataloader = dict(\n",
+ " batch_size=2,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=True, _scope_='mmaction'),\n",
+ " dataset=dict(\n",
+ " type='AVADataset',\n",
+ " ann_file='data/multisports/annotations/multisports_train.csv',\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit', io_backend='disk'),\n",
+ " dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(type='RandomRescale', scale_range=(256, 320)),\n",
+ " dict(type='RandomCrop', size=256),\n",
+ " dict(type='Flip', flip_ratio=0.5),\n",
+ " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n",
+ " dict(type='PackActionInputs')\n",
+ " ],\n",
+ " num_classes=66,\n",
+ " proposal_file=\n",
+ " 'data/multisports/annotations/multisports_proposals_train.pkl',\n",
+ " data_prefix=dict(img='data/multisports/trainval'),\n",
+ " timestamp_start=1,\n",
+ " start_index=0,\n",
+ " use_frames=False,\n",
+ " fps=1,\n",
+ " _scope_='mmaction'))\n",
+ "val_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n",
+ " dataset=dict(\n",
+ " type='AVADataset',\n",
+ " ann_file='data/multisports/annotations/multisports_val.csv',\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit', io_backend='disk'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " test_mode=True),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(type='Resize', scale=(-1, 256)),\n",
+ " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n",
+ " dict(type='PackActionInputs')\n",
+ " ],\n",
+ " num_classes=66,\n",
+ " proposal_file=\n",
+ " 'data/multisports/annotations/multisports_proposals_val.pkl',\n",
+ " data_prefix=dict(img='data/multisports/trainval'),\n",
+ " test_mode=True,\n",
+ " timestamp_start=1,\n",
+ " start_index=0,\n",
+ " use_frames=False,\n",
+ " fps=1,\n",
+ " _scope_='mmaction'))\n",
+ "test_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=8,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n",
+ " dataset=dict(\n",
+ " type='AVADataset',\n",
+ " ann_file='data/multisports/annotations/multisports_val.csv',\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit', io_backend='disk'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " test_mode=True),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(type='Resize', scale=(-1, 256)),\n",
+ " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n",
+ " dict(type='PackActionInputs')\n",
+ " ],\n",
+ " num_classes=66,\n",
+ " proposal_file=\n",
+ " 'data/multisports/annotations/multisports_dense_proposals_val.recall_96.13.pkl',\n",
+ " data_prefix=dict(img='data/multisports/trainval'),\n",
+ " test_mode=True,\n",
+ " timestamp_start=1,\n",
+ " start_index=0,\n",
+ " use_frames=False,\n",
+ " fps=1,\n",
+ " _scope_='mmaction'))\n",
+ "val_evaluator = dict(\n",
+ " type='MultiSportsMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_GT.pkl',\n",
+ " _scope_='mmaction')\n",
+ "test_evaluator = dict(\n",
+ " type='MultiSportsMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_GT.pkl',\n",
+ " _scope_='mmaction')\n",
+ "train_cfg = dict(\n",
+ " type='EpochBasedTrainLoop',\n",
+ " max_epochs=8,\n",
+ " val_begin=1,\n",
+ " val_interval=1,\n",
+ " _scope_='mmaction')\n",
+ "val_cfg = dict(type='ValLoop', _scope_='mmaction')\n",
+ "test_cfg = dict(type='TestLoop', _scope_='mmaction')\n",
+ "param_scheduler = [\n",
+ " dict(\n",
+ " type='LinearLR',\n",
+ " start_factor=0.1,\n",
+ " by_epoch=True,\n",
+ " begin=0,\n",
+ " end=5,\n",
+ " _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='MultiStepLR',\n",
+ " begin=0,\n",
+ " end=8,\n",
+ " by_epoch=True,\n",
+ " milestones=[6, 7],\n",
+ " gamma=0.1,\n",
+ " _scope_='mmaction')\n",
+ "]\n",
+ "optim_wrapper = dict(\n",
+ " optimizer=dict(\n",
+ " type='SGD',\n",
+ " lr=0.01,\n",
+ " momentum=0.9,\n",
+ " weight_decay=1e-05,\n",
+ " _scope_='mmaction'),\n",
+ " clip_grad=dict(max_norm=5, norm_type=2))\n",
+ "launcher = 'none'\n",
+ "work_dir = 'work_dirs/stad_model/'\n",
+ "randomness = dict(seed=None, diff_rank_seed=False, deterministic=False)\n",
+ "\n",
+ "06/15 06:10:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n",
+ "06/15 06:10:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n",
+ "before_run:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "before_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_train_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DistSamplerSeedHook \n",
+ " -------------------- \n",
+ "before_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) SyncBuffersHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_val_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) SyncBuffersHook \n",
+ " -------------------- \n",
+ "before_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_val_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train:\n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_test_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_run:\n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "06/15 06:10:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 236 out of 236 frames are valid.\n",
+ "06/15 06:10:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 120 out of 120 frames are valid.\n",
+ "06/15 06:10:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n",
+ "06/15 06:10:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n",
+ "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n",
+ "100% 124M/124M [00:01<00:00, 103MB/s]\n",
+ "06/15 06:10:28 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n",
+ "\n",
+ "unexpected key in source state_dict: cls_head.fc_cls.weight, cls_head.fc_cls.bias\n",
+ "\n",
+ "missing keys in source state_dict: roi_head.bbox_head.fc_cls.weight, roi_head.bbox_head.fc_cls.bias\n",
+ "\n",
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "100% 122M/122M [00:03<00:00, 36.1MB/s]\n",
+ "06/15 06:10:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "06/15 06:10:32 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n",
+ "06/15 06:10:32 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n",
+ "06/15 06:10:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/stad_model.\n",
+ "06/15 06:10:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 20/118] lr: 1.0000e-03 eta: 0:06:07 time: 0.3982 data_time: 0.0431 memory: 1383 grad_norm: 13.0844 loss: 1.3834 recall@thr=0.5: 0.5385 prec@thr=0.5: 0.5385 recall@top3: 0.8462 prec@top3: 0.2821 recall@top5: 0.8462 prec@top5: 0.1692 loss_action_cls: 1.3834\n",
+ "06/15 06:10:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 40/118] lr: 1.0000e-03 eta: 0:05:32 time: 0.3383 data_time: 0.0732 memory: 1383 grad_norm: 4.6786 loss: 0.6001 recall@thr=0.5: 0.9444 prec@thr=0.5: 0.9444 recall@top3: 0.9444 prec@top3: 0.3148 recall@top5: 0.9444 prec@top5: 0.1889 loss_action_cls: 0.6001\n",
+ "06/15 06:10:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 60/118] lr: 1.0000e-03 eta: 0:04:59 time: 0.2784 data_time: 0.0300 memory: 1383 grad_norm: 2.9446 loss: 0.5144 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5144\n",
+ "06/15 06:10:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 80/118] lr: 1.0000e-03 eta: 0:04:36 time: 0.2646 data_time: 0.0144 memory: 1383 grad_norm: 1.7695 loss: 0.4988 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 0.6923 prec@top3: 0.2308 recall@top5: 0.6923 prec@top5: 0.1385 loss_action_cls: 0.4988\n",
+ "06/15 06:11:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 1.0000e-03 eta: 0:04:35 time: 0.3502 data_time: 0.0839 memory: 1383 grad_norm: 2.4095 loss: 0.3218 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3218\n",
+ "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][118/118] lr: 1.0000e-03 eta: 0:04:20 time: 0.2563 data_time: 0.0102 memory: 1383 grad_norm: 1.8156 loss: 0.3895 recall@thr=0.5: 0.8125 prec@thr=0.5: 0.8125 recall@top3: 0.9375 prec@top3: 0.3125 recall@top5: 0.9375 prec@top5: 0.1875 loss_action_cls: 0.3895\n",
+ "06/15 06:11:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n",
+ "06/15 06:11:14 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 20/120] eta: 0:00:16 time: 0.1669 data_time: 0.1073 memory: 466 \n",
+ "06/15 06:11:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 40/120] eta: 0:00:13 time: 0.1698 data_time: 0.1145 memory: 466 \n",
+ "06/15 06:11:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 60/120] eta: 0:00:09 time: 0.1428 data_time: 0.0896 memory: 466 \n",
+ "06/15 06:11:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 80/120] eta: 0:00:05 time: 0.0998 data_time: 0.0504 memory: 466 \n",
+ "06/15 06:11:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1122 data_time: 0.0612 memory: 466 \n",
+ "06/15 06:11:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] eta: 0:00:00 time: 0.1031 data_time: 0.0528 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 59.66\n",
+ "aerobic split jump 30.80\n",
+ "aerobic scissors leap 88.34\n",
+ "aerobic turn 98.48\n",
+ "mAP 69.32\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 25.00\n",
+ "aerobic split jump 20.00\n",
+ "aerobic scissors leap 80.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 56.25\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 25.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 50.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 43.75\n",
+ "06/15 06:11:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] mAP/frameAP: 69.3181 mAP/v_map@0.2: 56.2500 mAP/v_map@0.5: 43.7500 mAP/v_map_0.05:0.45: 55.1389 mAP/v_map_0.10:0.90: 41.2500 mAP/v_map_0.50:0.95: 28.1750 data_time: 0.0793 time: 0.1324\n",
+ "06/15 06:11:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - The best checkpoint with 69.3181 mAP/frameAP at 1 epoch is saved to best_mAP_frameAP_epoch_1.pth.\n",
+ "06/15 06:11:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 20/118] lr: 3.2500e-03 eta: 0:04:10 time: 0.2884 data_time: 0.0401 memory: 1383 grad_norm: 1.3823 loss: 0.3596 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3596\n",
+ "06/15 06:11:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 40/118] lr: 3.2500e-03 eta: 0:04:00 time: 0.2728 data_time: 0.0204 memory: 1383 grad_norm: 1.2185 loss: 0.5274 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5274\n",
+ "06/15 06:11:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 60/118] lr: 3.2500e-03 eta: 0:03:56 time: 0.3296 data_time: 0.0699 memory: 1383 grad_norm: 1.7120 loss: 0.3599 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3599\n",
+ "06/15 06:11:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 80/118] lr: 3.2500e-03 eta: 0:03:46 time: 0.2584 data_time: 0.0120 memory: 1383 grad_norm: 1.7462 loss: 0.2598 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2598\n",
+ "06/15 06:12:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 3.2500e-03 eta: 0:03:39 time: 0.2858 data_time: 0.0263 memory: 1383 grad_norm: 0.8975 loss: 0.3959 recall@thr=0.5: 0.7692 prec@thr=0.5: 0.7692 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.3959\n",
+ "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][118/118] lr: 3.2500e-03 eta: 0:03:35 time: 0.3381 data_time: 0.0807 memory: 1383 grad_norm: 0.5466 loss: 0.4871 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4871\n",
+ "06/15 06:12:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n",
+ "06/15 06:12:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 20/120] eta: 0:00:12 time: 0.1230 data_time: 0.0693 memory: 466 \n",
+ "06/15 06:12:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 40/120] eta: 0:00:09 time: 0.1138 data_time: 0.0632 memory: 466 \n",
+ "06/15 06:12:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 60/120] eta: 0:00:07 time: 0.1214 data_time: 0.0672 memory: 466 \n",
+ "06/15 06:12:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 80/120] eta: 0:00:05 time: 0.1539 data_time: 0.1001 memory: 466 \n",
+ "06/15 06:12:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1488 data_time: 0.0936 memory: 466 \n",
+ "06/15 06:12:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] eta: 0:00:00 time: 0.1030 data_time: 0.0539 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 39.91\n",
+ "aerobic split jump 29.66\n",
+ "aerobic scissors leap 90.70\n",
+ "aerobic turn 96.92\n",
+ "mAP 64.30\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 20.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 55.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 36.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 34.00\n",
+ "06/15 06:12:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] mAP/frameAP: 64.2982 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 53.8889 mAP/v_map_0.10:0.90: 34.5833 mAP/v_map_0.50:0.95: 19.1250 data_time: 0.0744 time: 0.1270\n",
+ "06/15 06:12:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 20/118] lr: 5.5000e-03 eta: 0:03:28 time: 0.2786 data_time: 0.0358 memory: 1383 grad_norm: 1.0935 loss: 0.3780 recall@thr=0.5: 0.8667 prec@thr=0.5: 0.8667 recall@top3: 0.8667 prec@top3: 0.2889 recall@top5: 0.8667 prec@top5: 0.1733 loss_action_cls: 0.3780\n",
+ "06/15 06:12:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 40/118] lr: 5.5000e-03 eta: 0:03:22 time: 0.3217 data_time: 0.0573 memory: 1383 grad_norm: 1.4278 loss: 0.3261 recall@thr=0.5: 0.8750 prec@thr=0.5: 0.8750 recall@top3: 0.9375 prec@top3: 0.3125 recall@top5: 0.9375 prec@top5: 0.1875 loss_action_cls: 0.3261\n",
+ "06/15 06:12:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 60/118] lr: 5.5000e-03 eta: 0:03:15 time: 0.2823 data_time: 0.0358 memory: 1383 grad_norm: 0.6230 loss: 0.4514 recall@thr=0.5: 0.9286 prec@thr=0.5: 0.9286 recall@top3: 0.9286 prec@top3: 0.3095 recall@top5: 0.9286 prec@top5: 0.1857 loss_action_cls: 0.4514\n",
+ "06/15 06:12:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 80/118] lr: 5.5000e-03 eta: 0:03:08 time: 0.2561 data_time: 0.0115 memory: 1383 grad_norm: 0.1768 loss: 0.3241 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3241\n",
+ "06/15 06:12:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][100/118] lr: 5.5000e-03 eta: 0:03:02 time: 0.3094 data_time: 0.0422 memory: 1383 grad_norm: 0.4979 loss: 0.4081 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4081\n",
+ "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][118/118] lr: 5.5000e-03 eta: 0:02:56 time: 0.2776 data_time: 0.0266 memory: 1383 grad_norm: 0.7488 loss: 0.4131 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 0.6667 prec@top3: 0.2222 recall@top5: 0.6667 prec@top5: 0.1333 loss_action_cls: 0.4131\n",
+ "06/15 06:13:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 3 epochs\n",
+ "06/15 06:13:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 20/120] eta: 0:00:11 time: 0.1182 data_time: 0.0691 memory: 466 \n",
+ "06/15 06:13:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 40/120] eta: 0:00:09 time: 0.1132 data_time: 0.0628 memory: 466 \n",
+ "06/15 06:13:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 60/120] eta: 0:00:07 time: 0.1542 data_time: 0.0996 memory: 466 \n",
+ "06/15 06:13:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 80/120] eta: 0:00:05 time: 0.1479 data_time: 0.0937 memory: 466 \n",
+ "06/15 06:13:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][100/120] eta: 0:00:02 time: 0.1232 data_time: 0.0726 memory: 466 \n",
+ "06/15 06:13:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] eta: 0:00:00 time: 0.1029 data_time: 0.0529 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 29.65\n",
+ "aerobic split jump 20.83\n",
+ "aerobic scissors leap 90.63\n",
+ "aerobic turn 97.10\n",
+ "mAP 59.55\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 50.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 36.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 34.00\n",
+ "06/15 06:13:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] mAP/frameAP: 59.5538 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 50.0000 mAP/v_map_0.10:0.90: 32.9167 mAP/v_map_0.50:0.95: 19.1250 data_time: 0.0750 time: 0.1264\n",
+ "06/15 06:13:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 20/118] lr: 7.7500e-03 eta: 0:02:50 time: 0.3089 data_time: 0.0514 memory: 1383 grad_norm: 0.2046 loss: 0.3238 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 0.9091 prec@top3: 0.3030 recall@top5: 0.9091 prec@top5: 0.1818 loss_action_cls: 0.3238\n",
+ "06/15 06:13:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 40/118] lr: 7.7500e-03 eta: 0:02:46 time: 0.3790 data_time: 0.0937 memory: 1383 grad_norm: 0.7468 loss: 0.4123 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4123\n",
+ "06/15 06:13:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 60/118] lr: 7.7500e-03 eta: 0:02:39 time: 0.2685 data_time: 0.0171 memory: 1383 grad_norm: 0.1904 loss: 0.4407 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 0.6667 prec@top3: 0.2222 recall@top5: 0.6667 prec@top5: 0.1333 loss_action_cls: 0.4407\n",
+ "06/15 06:13:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 80/118] lr: 7.7500e-03 eta: 0:02:32 time: 0.2546 data_time: 0.0100 memory: 1383 grad_norm: 0.1966 loss: 0.4266 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4266\n",
+ "06/15 06:13:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][100/118] lr: 7.7500e-03 eta: 0:02:27 time: 0.3283 data_time: 0.0548 memory: 1383 grad_norm: 0.3165 loss: 0.3308 recall@thr=0.5: 0.8000 prec@thr=0.5: 0.8000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3308\n",
+ "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][118/118] lr: 7.7500e-03 eta: 0:02:21 time: 0.2671 data_time: 0.0151 memory: 1383 grad_norm: 0.1487 loss: 0.3003 recall@thr=0.5: 0.8333 prec@thr=0.5: 0.8333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3003\n",
+ "06/15 06:13:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 4 epochs\n",
+ "06/15 06:13:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 20/120] eta: 0:00:12 time: 0.1273 data_time: 0.0729 memory: 466 \n",
+ "06/15 06:14:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 40/120] eta: 0:00:10 time: 0.1306 data_time: 0.0797 memory: 466 \n",
+ "06/15 06:14:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 60/120] eta: 0:00:08 time: 0.1539 data_time: 0.0979 memory: 466 \n",
+ "06/15 06:14:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 80/120] eta: 0:00:05 time: 0.1355 data_time: 0.0815 memory: 466 \n",
+ "06/15 06:14:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][100/120] eta: 0:00:02 time: 0.1132 data_time: 0.0646 memory: 466 \n",
+ "06/15 06:14:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] eta: 0:00:00 time: 0.1050 data_time: 0.0553 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 23.92\n",
+ "aerobic split jump 19.60\n",
+ "aerobic scissors leap 91.02\n",
+ "aerobic turn 96.05\n",
+ "mAP 57.64\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 50.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 36.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 34.00\n",
+ "06/15 06:14:11 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] mAP/frameAP: 57.6444 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 50.0000 mAP/v_map_0.10:0.90: 32.9167 mAP/v_map_0.50:0.95: 18.3250 data_time: 0.0753 time: 0.1274\n",
+ "06/15 06:14:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 20/118] lr: 1.0000e-02 eta: 0:02:14 time: 0.2810 data_time: 0.0329 memory: 1383 grad_norm: 0.6113 loss: 0.4312 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.4312\n",
+ "06/15 06:14:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 40/118] lr: 1.0000e-02 eta: 0:02:09 time: 0.3316 data_time: 0.0732 memory: 1383 grad_norm: 0.2282 loss: 0.3932 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.3932\n",
+ "06/15 06:14:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 60/118] lr: 1.0000e-02 eta: 0:02:03 time: 0.2738 data_time: 0.0286 memory: 1383 grad_norm: 0.2938 loss: 0.3828 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 0.8571 prec@top5: 0.1714 loss_action_cls: 0.3828\n",
+ "06/15 06:14:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 80/118] lr: 1.0000e-02 eta: 0:01:56 time: 0.2756 data_time: 0.0192 memory: 1383 grad_norm: 0.1112 loss: 0.3722 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3722\n",
+ "06/15 06:14:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][100/118] lr: 1.0000e-02 eta: 0:01:51 time: 0.3193 data_time: 0.0573 memory: 1383 grad_norm: 0.6399 loss: 0.4427 recall@thr=0.5: 0.8000 prec@thr=0.5: 0.8000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4427\n",
+ "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][118/118] lr: 1.0000e-02 eta: 0:01:45 time: 0.2535 data_time: 0.0093 memory: 1383 grad_norm: 0.0985 loss: 0.2719 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2719\n",
+ "06/15 06:14:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 5 epochs\n",
+ "06/15 06:14:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 20/120] eta: 0:00:13 time: 0.1329 data_time: 0.0774 memory: 466 \n",
+ "06/15 06:14:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 40/120] eta: 0:00:12 time: 0.1787 data_time: 0.1259 memory: 466 \n",
+ "06/15 06:14:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 60/120] eta: 0:00:08 time: 0.1363 data_time: 0.0829 memory: 466 \n",
+ "06/15 06:14:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 80/120] eta: 0:00:05 time: 0.1012 data_time: 0.0513 memory: 466 \n",
+ "06/15 06:15:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][100/120] eta: 0:00:02 time: 0.1095 data_time: 0.0593 memory: 466 \n",
+ "06/15 06:15:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] eta: 0:00:00 time: 0.1033 data_time: 0.0536 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 14.21\n",
+ "aerobic split jump 15.37\n",
+ "aerobic scissors leap 91.25\n",
+ "aerobic turn 91.43\n",
+ "mAP 53.06\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 80.00\n",
+ "mAP 45.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 36.00\n",
+ "aerobic turn 20.00\n",
+ "mAP 14.00\n",
+ "06/15 06:15:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] mAP/frameAP: 53.0627 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 14.0000 mAP/v_map_0.05:0.45: 40.0000 mAP/v_map_0.10:0.90: 22.4444 mAP/v_map_0.50:0.95: 7.0250 data_time: 0.0749 time: 0.1267\n",
+ "06/15 06:15:09 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 20/118] lr: 1.0000e-02 eta: 0:01:39 time: 0.3193 data_time: 0.0634 memory: 1383 grad_norm: 0.5229 loss: 0.3929 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3929\n",
+ "06/15 06:15:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 40/118] lr: 1.0000e-02 eta: 0:01:33 time: 0.2972 data_time: 0.0439 memory: 1383 grad_norm: 0.4621 loss: 0.2891 recall@thr=0.5: 0.7692 prec@thr=0.5: 0.7692 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.2891\n",
+ "06/15 06:15:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 60/118] lr: 1.0000e-02 eta: 0:01:27 time: 0.2567 data_time: 0.0127 memory: 1383 grad_norm: 0.2534 loss: 0.3438 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3438\n",
+ "06/15 06:15:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 80/118] lr: 1.0000e-02 eta: 0:01:21 time: 0.3277 data_time: 0.0645 memory: 1383 grad_norm: 0.0856 loss: 0.1859 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 0.8571 prec@top5: 0.1714 loss_action_cls: 0.1859\n",
+ "06/15 06:15:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][100/118] lr: 1.0000e-02 eta: 0:01:15 time: 0.2995 data_time: 0.0503 memory: 1383 grad_norm: 0.3619 loss: 0.3205 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3205\n",
+ "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][118/118] lr: 1.0000e-02 eta: 0:01:10 time: 0.2619 data_time: 0.0190 memory: 1383 grad_norm: 0.3812 loss: 0.3911 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3911\n",
+ "06/15 06:15:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 6 epochs\n",
+ "06/15 06:15:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 20/120] eta: 0:00:17 time: 0.1739 data_time: 0.1178 memory: 466 \n",
+ "06/15 06:15:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 40/120] eta: 0:00:13 time: 0.1519 data_time: 0.1032 memory: 466 \n",
+ "06/15 06:15:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 60/120] eta: 0:00:08 time: 0.1031 data_time: 0.0536 memory: 466 \n",
+ "06/15 06:15:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 80/120] eta: 0:00:05 time: 0.0998 data_time: 0.0505 memory: 466 \n",
+ "06/15 06:15:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][100/120] eta: 0:00:02 time: 0.1126 data_time: 0.0620 memory: 466 \n",
+ "06/15 06:15:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] eta: 0:00:00 time: 0.0995 data_time: 0.0506 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 10.49\n",
+ "aerobic split jump 14.53\n",
+ "aerobic scissors leap 90.24\n",
+ "aerobic turn 87.53\n",
+ "mAP 50.70\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 40.00\n",
+ "mAP 35.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 36.00\n",
+ "aerobic turn 40.00\n",
+ "mAP 19.00\n",
+ "06/15 06:15:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] mAP/frameAP: 50.6970 mAP/v_map@0.2: 35.0000 mAP/v_map@0.5: 19.0000 mAP/v_map_0.05:0.45: 35.0000 mAP/v_map_0.10:0.90: 20.7778 mAP/v_map_0.50:0.95: 8.4000 data_time: 0.0724 time: 0.1229\n",
+ "06/15 06:16:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 20/118] lr: 1.0000e-03 eta: 0:01:04 time: 0.3578 data_time: 0.0847 memory: 1383 grad_norm: 0.5369 loss: 0.3628 recall@thr=0.5: 0.9167 prec@thr=0.5: 0.9167 recall@top3: 0.9167 prec@top3: 0.3056 recall@top5: 0.9167 prec@top5: 0.1833 loss_action_cls: 0.3628\n",
+ "06/15 06:16:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 40/118] lr: 1.0000e-03 eta: 0:00:58 time: 0.2652 data_time: 0.0202 memory: 1383 grad_norm: 0.1603 loss: 0.2293 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2293\n",
+ "06/15 06:16:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 60/118] lr: 1.0000e-03 eta: 0:00:52 time: 0.2710 data_time: 0.0178 memory: 1383 grad_norm: 0.3857 loss: 0.2737 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2737\n",
+ "06/15 06:16:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 80/118] lr: 1.0000e-03 eta: 0:00:46 time: 0.3420 data_time: 0.0698 memory: 1383 grad_norm: 0.1271 loss: 0.2149 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2149\n",
+ "06/15 06:16:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][100/118] lr: 1.0000e-03 eta: 0:00:40 time: 0.2673 data_time: 0.0232 memory: 1383 grad_norm: 0.0990 loss: 0.2749 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2749\n",
+ "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][118/118] lr: 1.0000e-03 eta: 0:00:34 time: 0.2612 data_time: 0.0156 memory: 1383 grad_norm: 0.1387 loss: 0.3211 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3211\n",
+ "06/15 06:16:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 7 epochs\n",
+ "06/15 06:16:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 20/120] eta: 0:00:16 time: 0.1657 data_time: 0.1063 memory: 466 \n",
+ "06/15 06:16:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 40/120] eta: 0:00:11 time: 0.1164 data_time: 0.0654 memory: 466 \n",
+ "06/15 06:16:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 60/120] eta: 0:00:07 time: 0.1053 data_time: 0.0546 memory: 466 \n",
+ "06/15 06:16:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 80/120] eta: 0:00:04 time: 0.1005 data_time: 0.0511 memory: 466 \n",
+ "06/15 06:16:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][100/120] eta: 0:00:02 time: 0.1035 data_time: 0.0533 memory: 466 \n",
+ "06/15 06:16:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] eta: 0:00:00 time: 0.1382 data_time: 0.0850 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 11.65\n",
+ "aerobic split jump 15.62\n",
+ "aerobic scissors leap 89.83\n",
+ "aerobic turn 93.96\n",
+ "mAP 52.77\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 80.00\n",
+ "mAP 45.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 38.67\n",
+ "aerobic turn 20.00\n",
+ "mAP 14.67\n",
+ "06/15 06:16:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] mAP/frameAP: 52.7652 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 14.6667 mAP/v_map_0.05:0.45: 40.6944 mAP/v_map_0.10:0.90: 22.6389 mAP/v_map_0.50:0.95: 6.6833 data_time: 0.0691 time: 0.1213\n",
+ "06/15 06:16:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 20/118] lr: 1.0000e-04 eta: 0:00:29 time: 0.3243 data_time: 0.0649 memory: 1383 grad_norm: 0.1808 loss: 0.3648 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3648\n",
+ "06/15 06:16:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 40/118] lr: 1.0000e-04 eta: 0:00:23 time: 0.2578 data_time: 0.0117 memory: 1383 grad_norm: 0.0784 loss: 0.2355 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2355\n",
+ "06/15 06:17:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 60/118] lr: 1.0000e-04 eta: 0:00:17 time: 0.3075 data_time: 0.0490 memory: 1383 grad_norm: 0.1707 loss: 0.3776 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3776\n",
+ "06/15 06:17:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 80/118] lr: 1.0000e-04 eta: 0:00:11 time: 0.3092 data_time: 0.0576 memory: 1383 grad_norm: 0.1387 loss: 0.3873 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3873\n",
+ "06/15 06:17:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][100/118] lr: 1.0000e-04 eta: 0:00:05 time: 0.2578 data_time: 0.0100 memory: 1383 grad_norm: 0.2137 loss: 0.3337 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3337\n",
+ "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_061017\n",
+ "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][118/118] lr: 1.0000e-04 eta: 0:00:00 time: 0.2755 data_time: 0.0148 memory: 1383 grad_norm: 0.0712 loss: 0.2038 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2038\n",
+ "06/15 06:17:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 8 epochs\n",
+ "06/15 06:17:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 20/120] eta: 0:00:11 time: 0.1180 data_time: 0.0649 memory: 466 \n",
+ "06/15 06:17:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 40/120] eta: 0:00:09 time: 0.1168 data_time: 0.0667 memory: 466 \n",
+ "06/15 06:17:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 60/120] eta: 0:00:06 time: 0.1026 data_time: 0.0535 memory: 466 \n",
+ "06/15 06:17:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 80/120] eta: 0:00:04 time: 0.1017 data_time: 0.0533 memory: 466 \n",
+ "06/15 06:17:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][100/120] eta: 0:00:02 time: 0.1444 data_time: 0.0915 memory: 466 \n",
+ "06/15 06:17:39 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] eta: 0:00:00 time: 0.1496 data_time: 0.0962 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 11.34\n",
+ "aerobic split jump 12.82\n",
+ "aerobic scissors leap 90.68\n",
+ "aerobic turn 90.47\n",
+ "mAP 51.33\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 80.00\n",
+ "mAP 45.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 72.00\n",
+ "aerobic turn 20.00\n",
+ "mAP 23.00\n",
+ "06/15 06:17:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] mAP/frameAP: 51.3281 mAP/v_map@0.2: 45.0000 mAP/v_map@0.5: 23.0000 mAP/v_map_0.05:0.45: 40.0000 mAP/v_map_0.10:0.90: 24.4444 mAP/v_map_0.50:0.95: 9.7250 data_time: 0.0704 time: 0.1216\n",
+ "\u001b[32mTraining finished successfully. \u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Train the model using MIM\n",
+ "!mim train mmaction2 configs/slowonly_k400_multisports.py \\\n",
+ " --work-dir work_dirs/stad_model/"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yVjHqupPooZ2"
+ },
+ "source": [
+ "## 4. Inferring the Spatiotemporal Action Detection Model\n",
+ "\n",
+ "After training the detection model and the spatiotemporal action detection model, we can use the spatiotemporal action detection demo for inference and visualize the model's performance.\n",
+ "\n",
+ "Since the tutorial uses a limited training dataset, the model's performance is not optimal, so a pre-trained model is used for visualization."
+ ]
+ },
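+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the command below, `--config` and `--checkpoint` point to the spatiotemporal action detection model (here the released pre-trained checkpoint), while `--det-config` and `--det-checkpoint` point to the human detector trained earlier in this tutorial. `--det-score-thr` and `--action-score-thr` are the confidence thresholds for keeping human boxes and action predictions, `--label-map` supplies the class names, and `--predict-stepsize`, `--output-stepsize` and `--output-fps` control how often predictions are made and how the output video is written.\n"
+   ]
+  },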
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NQF1yrEhooZ3",
+ "outputId": "5331fbb6-7075-415c-f6f0-ec41c4b584a4"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n",
+ "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n",
+ "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n",
+ "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n",
+ "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n",
+ "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n",
+ "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n",
+ "Performing Human Detection for each frame\n",
+ "[>>] 99/99, 7.0 task/s, elapsed: 14s, ETA: 0s\n",
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "Performing SpatioTemporal Action Detection for each clip\n",
+ "[>>] 99/99, 17.1 task/s, elapsed: 6s, ETA: 0sPerforming visualization\n",
+ "Moviepy - Building video data/demo_spatiotemporal_det.mp4.\n",
+ "Moviepy - Writing video data/demo_spatiotemporal_det.mp4\n",
+ "\n",
+ "Moviepy - Done !\n",
+ "Moviepy - video ready data/demo_spatiotemporal_det.mp4\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python ../../demo/demo_spatiotemporal_det.py \\\n",
+ " data/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4 \\\n",
+ " data/demo_spatiotemporal_det.mp4 \\\n",
+ " --config configs/slowonly_k400_multisports.py \\\n",
+ " --checkpoint https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth \\\n",
+ " --det-config configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n",
+ " --det-checkpoint work_dirs/det_model/epoch_2.pth \\\n",
+ " --det-score-thr 0.85 \\\n",
+ " --action-score-thr 0.8 \\\n",
+ " --label-map ../../tools/data/multisports/label_map.txt \\\n",
+ " --predict-stepsize 8 \\\n",
+ " --output-stepsize 1 \\\n",
+ " --output-fps 24"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 741
+ },
+ "id": "9JmeIkh5ooZ3",
+ "outputId": "7fc38469-d8c4-4a02-81e7-ff93b88a62b2"
+ },
+ "outputs": [],
+ "source": [
+ "# Show Video\n",
+ "import moviepy.editor\n",
+ "moviepy.editor.ipython_display(\"data/demo_spatiotemporal_det.mp4\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "ipy_stad",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.0"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/projects/stad_tutorial/demo_stad_zh_CN.ipynb b/projects/stad_tutorial/demo_stad_zh_CN.ipynb
new file mode 100644
index 0000000000..30df0f5624
--- /dev/null
+++ b/projects/stad_tutorial/demo_stad_zh_CN.ipynb
@@ -0,0 +1,4107 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "B74HkZjCxQ_6"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MwmrGv9exRAH"
+ },
+ "source": [
+ "# 基于 MMAction2 进行时空行为检测任务\n",
+ "欢迎使用 MMAction2! 这是一篇关于如何使用 MMAction2 进行时空行为检测的教程。在此教程中,我们会以 MultiSports 数据集为例,提供时空行为检测的完整步骤教程,包括\n",
+ "- 准备时空行为检测数据集\n",
+ "- 训练检测模型\n",
+ "- 准备 AVA 格式的数据集\n",
+ "- 训练时空行为检测模型\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "em5lgDTUxRAI"
+ },
+ "source": [
+ "## 0. 安装 MMAction2 和 MMDetection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bBM9DCrsxRAJ",
+ "outputId": "b310311f-f05e-4a5c-b6e5-8e6ee7e0dfae"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting openmim\n",
+ " Downloading openmim-0.3.7-py2.py3-none-any.whl (51 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.3/51.3 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: Click in /usr/local/lib/python3.10/dist-packages (from openmim) (8.1.3)\n",
+ "Collecting colorama (from openmim)\n",
+ " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
+ "Collecting model-index (from openmim)\n",
+ " Downloading model_index-0.1.11-py3-none-any.whl (34 kB)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from openmim) (1.5.3)\n",
+ "Requirement already satisfied: pip>=19.3 in /usr/local/lib/python3.10/dist-packages (from openmim) (23.1.2)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from openmim) (2.27.1)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from openmim) (13.3.4)\n",
+ "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from openmim) (0.8.10)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (6.0)\n",
+ "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from model-index->openmim) (3.4.3)\n",
+ "Collecting ordered-set (from model-index->openmim)\n",
+ " Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (2022.7.1)\n",
+ "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas->openmim) (1.22.4)\n",
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (1.26.15)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2022.12.7)\n",
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (2.0.12)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->openmim) (3.4)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->openmim) (2.14.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->openmim) (0.1.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->openmim) (1.16.0)\n",
+ "Installing collected packages: ordered-set, colorama, model-index, openmim\n",
+ "Successfully installed colorama-0.4.6 model-index-0.1.11 openmim-0.3.7 ordered-set-4.1.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n",
+ "Collecting mmengine\n",
+ " Downloading mmengine-0.7.4-py3-none-any.whl (374 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m374.3/374.3 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting addict (from mmengine)\n",
+ " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine) (3.7.1)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmengine) (1.22.4)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmengine) (6.0)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine) (13.3.4)\n",
+ "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine) (2.3.0)\n",
+ "Collecting yapf (from mmengine)\n",
+ " Downloading yapf-0.40.0-py3-none-any.whl (250 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.3/250.3 kB\u001b[0m \u001b[31m29.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmengine) (4.7.0.72)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (1.4.4)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (23.1)\n",
+ "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (8.4.0)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine) (2.8.2)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine) (2.14.0)\n",
+ "Collecting importlib-metadata>=6.6.0 (from yapf->mmengine)\n",
+ " Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)\n",
+ "Collecting platformdirs>=3.5.1 (from yapf->mmengine)\n",
+ " Downloading platformdirs-3.5.3-py3-none-any.whl (15 kB)\n",
+ "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmengine) (2.0.1)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmengine) (3.15.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine) (0.1.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine) (1.16.0)\n",
+ "Installing collected packages: addict, platformdirs, importlib-metadata, yapf, mmengine\n",
+ " Attempting uninstall: platformdirs\n",
+ " Found existing installation: platformdirs 3.3.0\n",
+ " Uninstalling platformdirs-3.3.0:\n",
+ " Successfully uninstalled platformdirs-3.3.0\n",
+ "Successfully installed addict-2.4.0 importlib-metadata-6.6.0 mmengine-0.7.4 platformdirs-3.5.3 yapf-0.40.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n",
+ "Collecting mmcv\n",
+ " Downloading https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl (74.4 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.4/74.4 MB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv) (2.4.0)\n",
+ "Requirement already satisfied: mmengine>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.7.4)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv) (1.22.4)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv) (23.1)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv) (8.4.0)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv) (6.0)\n",
+ "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv) (0.40.0)\n",
+ "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv) (4.7.0.72)\n",
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (3.7.1)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (13.3.4)\n",
+ "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine>=0.2.0->mmcv) (2.3.0)\n",
+ "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (6.6.0)\n",
+ "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (3.5.3)\n",
+ "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv) (2.0.1)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv) (3.15.0)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (1.4.4)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmengine>=0.2.0->mmcv) (2.8.2)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine>=0.2.0->mmcv) (2.14.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine>=0.2.0->mmcv) (0.1.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmengine>=0.2.0->mmcv) (1.16.0)\n",
+ "Installing collected packages: mmcv\n",
+ "Successfully installed mmcv-2.0.0\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html\n",
+ "Collecting mmdet\n",
+ " Downloading mmdet-3.0.0-py3-none-any.whl (1.7 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmdet) (3.7.1)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.22.4)\n",
+ "Requirement already satisfied: pycocotools in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.6)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.10.1)\n",
+ "Requirement already satisfied: shapely in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.1)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from mmdet) (1.16.0)\n",
+ "Collecting terminaltables (from mmdet)\n",
+ " Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)\n",
+ "Requirement already satisfied: mmcv<2.1.0,>=2.0.0rc4 in /usr/local/lib/python3.10/dist-packages (from mmdet) (2.0.0)\n",
+ "Requirement already satisfied: mmengine<1.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from mmdet) (0.7.4)\n",
+ "Requirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.4.0)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (23.1)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (8.4.0)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.0)\n",
+ "Requirement already satisfied: yapf in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (0.40.0)\n",
+ "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.10/dist-packages (from mmcv<2.1.0,>=2.0.0rc4->mmdet) (4.7.0.72)\n",
+ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (13.3.4)\n",
+ "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from mmengine<1.0.0,>=0.7.1->mmdet) (2.3.0)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (1.4.4)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmdet) (2.8.2)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->mmengine<1.0.0,>=0.7.1->mmdet) (2.14.0)\n",
+ "Requirement already satisfied: importlib-metadata>=6.6.0 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (6.6.0)\n",
+ "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.5.3)\n",
+ "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (2.0.1)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv<2.1.0,>=2.0.0rc4->mmdet) (3.15.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->mmengine<1.0.0,>=0.7.1->mmdet) (0.1.2)\n",
+ "Installing collected packages: terminaltables, mmdet\n",
+ "Successfully installed mmdet-3.0.0 terminaltables-3.1.10\n",
+ "Cloning into 'mmaction2'...\n",
+ "remote: Enumerating objects: 22869, done.\u001b[K\n",
+ "remote: Counting objects: 100% (1491/1491), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (801/801), done.\u001b[K\n",
+ "remote: Total 22869 (delta 854), reused 1171 (delta 685), pack-reused 21378\u001b[K\n",
+ "Receiving objects: 100% (22869/22869), 82.81 MiB | 27.92 MiB/s, done.\n",
+ "Resolving deltas: 100% (15952/15952), done.\n",
+ "/content/mmaction2\n",
+ "Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Obtaining file:///content/mmaction2\n",
+ " Running command python setup.py egg_info\n",
+ " running egg_info\n",
+ " creating /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info\n",
+ " writing /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/PKG-INFO\n",
+ " writing dependency_links to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/dependency_links.txt\n",
+ " writing requirements to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/requires.txt\n",
+ " writing top-level names to /tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/top_level.txt\n",
+ " writing manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest template 'MANIFEST.in'\n",
+ " warning: no files found matching 'mmaction/.mim/model-index.yml'\n",
+ " warning: no files found matching '*.py' under directory 'mmaction/.mim/configs'\n",
+ " warning: no files found matching '*.yml' under directory 'mmaction/.mim/configs'\n",
+ " warning: no files found matching '*.sh' under directory 'mmaction/.mim/tools'\n",
+ " warning: no files found matching '*.py' under directory 'mmaction/.mim/tools'\n",
+ " adding license file 'LICENSE'\n",
+ " writing manifest file '/tmp/pip-pip-egg-info-x4x7terp/mmaction2.egg-info/SOURCES.txt'\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting decord>=0.4.1 (from mmaction2==1.0.0)\n",
+ " Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m98.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting einops (from mmaction2==1.0.0)\n",
+ " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (3.7.1)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.22.4)\n",
+ "Requirement already satisfied: opencv-contrib-python in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (4.7.0.72)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (8.4.0)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (1.10.1)\n",
+ "Requirement already satisfied: torch>=1.3 in /usr/local/lib/python3.10/dist-packages (from mmaction2==1.0.0) (2.0.1+cu118)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.12.0)\n",
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (4.5.0)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (1.11.1)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (3.1.2)\n",
+ "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.3->mmaction2==1.0.0) (2.0.0)\n",
+ "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (3.25.2)\n",
+ "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.3->mmaction2==1.0.0) (16.0.5)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.0.7)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (4.39.3)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (1.4.4)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (23.1)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (3.0.9)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->mmaction2==1.0.0) (2.8.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->mmaction2==1.0.0) (1.16.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.3->mmaction2==1.0.0) (2.1.2)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.3->mmaction2==1.0.0) (1.3.0)\n",
+ "Installing collected packages: einops, decord, mmaction2\n",
+ " Running setup.py develop for mmaction2\n",
+ " Running command python setup.py develop\n",
+ " running develop\n",
+ " /usr/local/lib/python3.10/dist-packages/setuptools/command/develop.py:40: EasyInstallDeprecationWarning: easy_install command is deprecated.\n",
+ " !!\n",
+ "\n",
+ " ********************************************************************************\n",
+ " Please avoid running ``setup.py`` and ``easy_install``.\n",
+ " Instead, use pypa/build, pypa/installer, pypa/build or\n",
+ " other standards-based tools.\n",
+ "\n",
+ " See https://github.com/pypa/setuptools/issues/917 for details.\n",
+ " ********************************************************************************\n",
+ "\n",
+ " !!\n",
+ " easy_install.initialize_options(self)\n",
+ " /usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.\n",
+ " !!\n",
+ "\n",
+ " ********************************************************************************\n",
+ " Please avoid running ``setup.py`` directly.\n",
+ " Instead, use pypa/build, pypa/installer, pypa/build or\n",
+ " other standards-based tools.\n",
+ "\n",
+ " See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.\n",
+ " ********************************************************************************\n",
+ "\n",
+ " !!\n",
+ " self.initialize_options()\n",
+ " running egg_info\n",
+ " creating mmaction2.egg-info\n",
+ " writing mmaction2.egg-info/PKG-INFO\n",
+ " writing dependency_links to mmaction2.egg-info/dependency_links.txt\n",
+ " writing requirements to mmaction2.egg-info/requires.txt\n",
+ " writing top-level names to mmaction2.egg-info/top_level.txt\n",
+ " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest file 'mmaction2.egg-info/SOURCES.txt'\n",
+ " reading manifest template 'MANIFEST.in'\n",
+ " adding license file 'LICENSE'\n",
+ " writing manifest file 'mmaction2.egg-info/SOURCES.txt'\n",
+ " running build_ext\n",
+ " Creating /usr/local/lib/python3.10/dist-packages/mmaction2.egg-link (link to .)\n",
+ " Adding mmaction2 1.0.0 to easy-install.pth file\n",
+ "\n",
+ " Installed /content/mmaction2\n",
+ "Successfully installed decord-0.6.0 einops-0.6.1 mmaction2-1.0.0\n",
+ "/content/mmaction2/projects/stad_tutorial\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -U openmim\n",
+ "!mim install mmengine\n",
+ "!mim install mmcv\n",
+ "!mim install mmdet\n",
+ "\n",
+ "!git clone https://github.com/open-mmlab/mmaction2.git\n",
+ "\n",
+ "%cd mmaction2\n",
+ "%pip install -v -e .\n",
+ "%cd projects/stad_tutorial"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4M1PQASJxRAM"
+ },
+ "source": [
+ "## 1. Prepare the spatiotemporal action detection dataset\n",
+ "\n",
+ "Just as detection tasks require bounding-box annotations, spatiotemporal action detection has to localize actions in both time and space, so it requires more complex tube annotations. Taking the MultiSports dataset as an example, the `gttubes` field provides all the action annotations in a video. Below is an annotation snippet:\n",
+ "\n",
+ "```\n",
+ " 'gttubes': {\n",
+ " 'aerobic_gymnastics/v_aqMgwPExjD0_c001': # video_key\n",
+ " {\n",
+ " 10: # category id\n",
+ " [\n",
+ " array([[ 377., 904., 316., 1016., 584.], # the 1st tube of category 10,\n",
+ " [ 378., 882., 315., 1016., 579.], # shape (n, 5): n frames, each row contains (frame index, x1, y1, x2, y2)\n",
+ " ...\n",
+ " [ 398., 861., 304., 954., 549.]], dtype=float32),\n",
+ "\n",
+ " array([[ 399., 881., 308., 955., 542.], # the 2nd tube of category 10\n",
+ " [ 400., 862., 303., 988., 539.],\n",
+ " [ 401., 853., 292., 1000., 535.],\n",
+ " ...])\n",
+ " ...\n",
+ "\n",
+ " ] ,\n",
+ " 9: # category id\n",
+ " [\n",
+ " array(...), # the 1st tube of category 9\n",
+ " array(...), # the 2nd tube of category 9\n",
+ " ...\n",
+ " ]\n",
+ " ...\n",
+ " }\n",
+ " }\n",
+ "```\n",
+ "\n",
+ "The annotation file also needs to provide information in several other fields. A complete ground-truth file contains the following:\n",
+ "```\n",
+ "{\n",
+ " 'labels': # list of labels\n",
+ " ['aerobic push up', 'aerobic explosive push up', ...],\n",
+ " 'train_videos': # list of training videos\n",
+ " [\n",
+ " [\n",
+ " 'aerobic_gymnastics/v_aqMgwPExjD0_c001',\n",
+ " 'aerobic_gymnastics/v_yaKOumdXwbU_c019',\n",
+ " ...\n",
+ " ]\n",
+ " ]\n",
+ " 'test_videos': # list of test videos\n",
+ " [\n",
+ " [\n",
+ " 'aerobic_gymnastics/v_crsi07chcV8_c004',\n",
+ " 'aerobic_gymnastics/v_dFYr67eNMwA_c005',\n",
+ " ...\n",
+ " ]\n",
+ " ]\n",
+ " 'n_frames': # dict that provides the number of frames of each video\n",
+ " {\n",
+ " 'aerobic_gymnastics/v_crsi07chcV8_c004': 725,\n",
+ " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': 750,\n",
+ " ...\n",
+ " }\n",
+ " 'resolution': # dict that provides the resolution of each video\n",
+ " {\n",
+ " 'aerobic_gymnastics/v_crsi07chcV8_c004': (720, 1280),\n",
+ " 'aerobic_gymnastics/v_dFYr67eNMwA_c005': (720, 1280),\n",
+ " ...\n",
+ " }\n",
+ " 'gttubes': # dict that provides the bounding box information of the tubes\n",
+ " {\n",
+ " ... # format as described above\n",
+ " }\n",
+ "}\n",
+ "```\n",
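+ "\n",
+ "As a quick sanity check, a minimal sketch for loading this pickle and inspecting the tubes of one training video looks like the following (the file path matches where the annotation file is extracted in the next cell):\n",
+ "\n",
+ "```python\n",
+ "import pickle\n",
+ "\n",
+ "with open('data/multisports/annotations/multisports_GT.pkl', 'rb') as f:\n",
+ "    gt = pickle.load(f)\n",
+ "\n",
+ "print(gt['labels'][:3])            # first few action labels\n",
+ "video_key = gt['train_videos'][0][0]\n",
+ "tubes = gt['gttubes'][video_key]   # dict: category id -> list of (n, 5) arrays\n",
+ "for label_id, tube_list in tubes.items():\n",
+ "    print(label_id, [tube.shape for tube in tube_list])\n",
+ "```\n",
+ "\n",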
+ "The following experiments are based on MultiSports-tiny, a small subset of videos that we extracted from MultiSports to demonstrate the whole pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "fiJPDuR9xRAQ",
+ "outputId": "8b3d8719-a9c0-4a59-d220-a3626fa34d3b"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2023-06-15 06:41:29-- https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n",
+ "Resolving download.openmmlab.com (download.openmmlab.com)... 8.48.85.214, 8.48.85.207, 8.48.85.208, ...\n",
+ "Connecting to download.openmmlab.com (download.openmmlab.com)|8.48.85.214|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 82780160 (79M) [application/x-tar]\n",
+ "Saving to: ‘data/multisports-tiny.tar’\n",
+ "\n",
+ "multisports-tiny.ta 100%[===================>] 78.95M 27.9MB/s in 2.8s \n",
+ "\n",
+ "2023-06-15 06:41:32 (27.9 MB/s) - ‘data/multisports-tiny.tar’ saved [82780160/82780160]\n",
+ "\n",
+ "multisports-tiny/multisports/\n",
+ "multisports-tiny/multisports/test/\n",
+ "multisports-tiny/multisports/test/aerobic_gymnastics/\n",
+ "multisports-tiny/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4\n",
+ "multisports-tiny/multisports/annotations/\n",
+ "multisports-tiny/multisports/annotations/multisports_GT.pkl\n",
+ "multisports-tiny/multisports/trainval/\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c001.mp4\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c003.mp4\n",
+ "multisports-tiny/multisports/trainval/aerobic_gymnastics/v__wAgwttPYaQ_c002.mp4\n",
+ "Reading package lists...\n",
+ "Building dependency tree...\n",
+ "Reading state information...\n",
+ "The following NEW packages will be installed:\n",
+ " tree\n",
+ "0 upgraded, 1 newly installed, 0 to remove and 46 not upgraded.\n",
+ "Need to get 43.0 kB of archives.\n",
+ "After this operation, 115 kB of additional disk space will be used.\n",
+ "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tree amd64 1.8.0-1 [43.0 kB]\n",
+ "Fetched 43.0 kB in 0s (253 kB/s)\n",
+ "Selecting previously unselected package tree.\n",
+ "(Reading database ... 122541 files and directories currently installed.)\n",
+ "Preparing to unpack .../tree_1.8.0-1_amd64.deb ...\n",
+ "Unpacking tree (1.8.0-1) ...\n",
+ "Setting up tree (1.8.0-1) ...\n",
+ "Processing triggers for man-db (2.9.1-1) ...\n",
+ "\u001b[01;34mdata\u001b[00m\n",
+ "├── \u001b[01;34mmultisports\u001b[00m\n",
+ "│ ├── \u001b[01;34mannotations\u001b[00m\n",
+ "│ │ └── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n",
+ "│ ├── \u001b[01;34mtest\u001b[00m\n",
+ "│ │ └── \u001b[01;34maerobic_gymnastics\u001b[00m\n",
+ "│ │ └── \u001b[01;32mv_7G_IpU0FxLU_c001.mp4\u001b[00m\n",
+ "│ └── \u001b[01;34mtrainval\u001b[00m\n",
+ "│ └── \u001b[01;34maerobic_gymnastics\u001b[00m\n",
+ "│ ├── \u001b[01;32mv__wAgwttPYaQ_c001.mp4\u001b[00m\n",
+ "│ ├── \u001b[01;32mv__wAgwttPYaQ_c002.mp4\u001b[00m\n",
+ "│ └── \u001b[01;32mv__wAgwttPYaQ_c003.mp4\u001b[00m\n",
+ "└── \u001b[01;31mmultisports-tiny.tar\u001b[00m\n",
+ "\n",
+ "6 directories, 6 files\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Download the dataset\n",
+ "!wget -P data -c https://download.openmmlab.com/mmaction/v1.0/projects/stad_tutorial/multisports-tiny.tar\n",
+ "!tar -xvf data/multisports-tiny.tar --strip 1 -C data\n",
+ "!apt-get -q install tree\n",
+ "!tree data"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "XjG0dEE8xRAS"
+ },
+ "source": [
+ "## 2. Train the detection model\n",
+ "\n",
+ "In the SlowOnly + Det paradigm, a human detector is trained first, and actions are then predicted based on its detection results. In this section, we train the detection model with MMDetection, based on the annotation format described in the previous section.\n",
+ "\n",
+ "### 2.1 Build detection dataset annotations (COCO format)\n",
+ "\n",
+ "Based on the annotations of the spatiotemporal action detection dataset, we can build a COCO-format detection dataset for training the detection model. We provide a tool script to convert the MultiSports-format annotations; to convert from other formats, refer to MMDetection's [customize dataset](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/customize_dataset.html) documentation.\n",
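+ "\n",
+ "Once the conversion script in the next cell has produced the JSON files, a minimal sketch like the one below (file name taken from the script's output) can be used to peek at the generated COCO-style annotation; the printed keys are the ones expected for a COCO detection file:\n",
+ "\n",
+ "```python\n",
+ "import json\n",
+ "\n",
+ "# File produced by tools/generate_mmdet_anno.py in the next cell.\n",
+ "with open('data/multisports/annotations/multisports_det_anno_train.json') as f:\n",
+ "    coco_anno = json.load(f)\n",
+ "\n",
+ "print(coco_anno.keys())             # expected: 'images', 'annotations', 'categories'\n",
+ "print(coco_anno['categories'])      # a single 'person' category\n",
+ "print(coco_anno['annotations'][0])  # one bbox entry in COCO format\n",
+ "```"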
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "inBtClMIxRAV",
+ "outputId": "3ac5199b-562f-48c4-da27-819d34069213"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[01;34mdata/multisports/annotations\u001b[00m\n",
+ "├── multisports_det_anno_train.json\n",
+ "├── multisports_det_anno_val.json\n",
+ "└── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n",
+ "\n",
+ "0 directories, 3 files\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python tools/generate_mmdet_anno.py data/multisports/annotations/multisports_GT.pkl data/multisports/annotations/multisports_det_anno.json\n",
+ "!tree data/multisports/annotations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TkPONRezxRAZ",
+ "outputId": "0f8075a1-47fb-490d-9c88-4904f45363fb"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Will generate 3 rgb dir for aerobic_gymnastics.\n",
+ "Generate v__wAgwttPYaQ_c003 rgb dir successfully.\n",
+ "Generate v__wAgwttPYaQ_c002 rgb dir successfully.\n",
+ "Generate v__wAgwttPYaQ_c001 rgb dir successfully.\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python tools/generate_rgb.py"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MP-umqqnxRAa"
+ },
+ "source": [
+ "### 2.2 Modify the config file\n",
+ "\n",
+ "We take faster-rcnn_r50-caffe_fpn_ms-1x_coco as the base config and make the following modifications to train on the MultiSports dataset:\n",
+ "- number of classes of the model\n",
+ "- learning rate schedule\n",
+ "- optimizer config\n",
+ "- dataset / annotation file paths\n",
+ "- evaluator config\n",
+ "- pre-trained model\n",
+ "\n",
+ "For a more detailed tutorial, refer to the [prepare a config](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#id9) documentation provided by MMDetection.\n",
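+ "\n",
+ "To check how the `_base_` config and these overrides get merged, a minimal sketch using MMEngine's config loader (run from `projects/stad_tutorial`) is:\n",
+ "\n",
+ "```python\n",
+ "from mmengine.config import Config\n",
+ "\n",
+ "# Load the tutorial config shown below; settings from _base_ are merged automatically.\n",
+ "cfg = Config.fromfile('configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py')\n",
+ "print(cfg.model.roi_head.bbox_head.num_classes)  # 1, person only\n",
+ "print(cfg.train_dataloader.dataset.ann_file)     # annotations/multisports_det_anno_train.json\n",
+ "print(cfg.load_from)                             # pre-trained person detector checkpoint\n",
+ "```"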
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yMw9MrI0xRAc",
+ "outputId": "1f5ee99a-d4cb-45b0-df71-f0209a9b6275"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "# Copyright (c) OpenMMLab. All rights reserved.\n",
+ "_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'\n",
+ "model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))\n",
+ "\n",
+ "# take 2 epochs as an example\n",
+ "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n",
+ "\n",
+ "# learning rate\n",
+ "param_scheduler = [\n",
+ " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n",
+ "]\n",
+ "\n",
+ "# optimizer\n",
+ "optim_wrapper = dict(\n",
+ " type='OptimWrapper',\n",
+ " optimizer=dict(type='SGD', lr=0.0050, momentum=0.9, weight_decay=0.0001))\n",
+ "\n",
+ "dataset_type = 'CocoDataset'\n",
+ "# modify metainfo\n",
+ "metainfo = {\n",
+ " 'classes': ('person', ),\n",
+ " 'palette': [\n",
+ " (220, 20, 60),\n",
+ " ]\n",
+ "}\n",
+ "\n",
+ "# specify metainfo, dataset path\n",
+ "data_root = 'data/multisports/'\n",
+ "\n",
+ "train_dataloader = dict(\n",
+ " dataset=dict(\n",
+ " data_root=data_root,\n",
+ " ann_file='annotations/multisports_det_anno_train.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " metainfo=metainfo))\n",
+ "\n",
+ "val_dataloader = dict(\n",
+ " dataset=dict(\n",
+ " data_root=data_root,\n",
+ " ann_file='annotations/multisports_det_anno_val.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " metainfo=metainfo))\n",
+ "\n",
+ "test_dataloader = dict(\n",
+ " dataset=dict(\n",
+ " data_root=data_root,\n",
+ " ann_file='annotations/ms_infer_anno.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " metainfo=metainfo))\n",
+ "\n",
+ "# specify annotaition file path, modify metric items\n",
+ "val_evaluator = dict(\n",
+ " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5],\n",
+ ")\n",
+ "\n",
+ "test_evaluator = dict(\n",
+ " ann_file='data/multisports/annotations/ms_infer_anno.json',\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5],\n",
+ ")\n",
+ "\n",
+ "# specify pretrain checkpoint\n",
+ "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501\n"
+ ]
+ }
+ ],
+ "source": [
+ "!cat configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "S3Ux8echxRAe"
+ },
+ "source": [
+ "### 2.3 Train the detection model"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MYtjYFU5xRAf"
+ },
+ "source": [
+ "With MIM, the MMDetection model can be trained directly from the current directory. Here we provide the simplest single-GPU training example; for more training commands, refer to the MIM [tutorial](https://github.com/open-mmlab/mim#command)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "56m--2T8xRAg",
+ "outputId": "d47ceca0-e930-4063-e25d-739a44410b86"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/train.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py --launcher none --work-dir work_dirs/det_model. \n",
+ "06/15 06:42:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 1318688827\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
+ " - LAPACK is enabled (usually provided by MKL)\n",
+ " - NNPACK is enabled\n",
+ " - CPU capability usage: AVX2\n",
+ " - CUDA Runtime 11.8\n",
+ " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n",
+ " - CuDNN 8.7\n",
+ " - Magma 2.6.1\n",
+ " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n",
+ "\n",
+ " TorchVision: 0.15.2+cu118\n",
+ " OpenCV: 4.7.0\n",
+ " MMEngine: 0.7.4\n",
+ "\n",
+ "Runtime environment:\n",
+ " cudnn_benchmark: False\n",
+ " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n",
+ " dist_cfg: {'backend': 'nccl'}\n",
+ " seed: 1318688827\n",
+ " Distributed launcher: none\n",
+ " Distributed training: False\n",
+ " GPU number: 1\n",
+ "------------------------------------------------------------\n",
+ "\n",
+ "06/15 06:42:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n",
+ "model = dict(\n",
+ " type='FasterRCNN',\n",
+ " data_preprocessor=dict(\n",
+ " type='DetDataPreprocessor',\n",
+ " mean=[103.53, 116.28, 123.675],\n",
+ " std=[1.0, 1.0, 1.0],\n",
+ " bgr_to_rgb=False,\n",
+ " pad_size_divisor=32),\n",
+ " backbone=dict(\n",
+ " type='ResNet',\n",
+ " depth=50,\n",
+ " num_stages=4,\n",
+ " out_indices=(0, 1, 2, 3),\n",
+ " frozen_stages=1,\n",
+ " norm_cfg=dict(type='BN', requires_grad=False),\n",
+ " norm_eval=True,\n",
+ " style='caffe',\n",
+ " init_cfg=dict(\n",
+ " type='Pretrained',\n",
+ " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n",
+ " neck=dict(\n",
+ " type='FPN',\n",
+ " in_channels=[256, 512, 1024, 2048],\n",
+ " out_channels=256,\n",
+ " num_outs=5),\n",
+ " rpn_head=dict(\n",
+ " type='RPNHead',\n",
+ " in_channels=256,\n",
+ " feat_channels=256,\n",
+ " anchor_generator=dict(\n",
+ " type='AnchorGenerator',\n",
+ " scales=[8],\n",
+ " ratios=[0.5, 1.0, 2.0],\n",
+ " strides=[4, 8, 16, 32, 64]),\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[1.0, 1.0, 1.0, 1.0]),\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n",
+ " roi_head=dict(\n",
+ " type='StandardRoIHead',\n",
+ " bbox_roi_extractor=dict(\n",
+ " type='SingleRoIExtractor',\n",
+ " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n",
+ " out_channels=256,\n",
+ " featmap_strides=[4, 8, 16, 32]),\n",
+ " bbox_head=dict(\n",
+ " type='Shared2FCBBoxHead',\n",
+ " in_channels=256,\n",
+ " fc_out_channels=1024,\n",
+ " roi_feat_size=7,\n",
+ " num_classes=1,\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[0.1, 0.1, 0.2, 0.2]),\n",
+ " reg_class_agnostic=False,\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n",
+ " train_cfg=dict(\n",
+ " rpn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.7,\n",
+ " neg_iou_thr=0.3,\n",
+ " min_pos_iou=0.3,\n",
+ " match_low_quality=True,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=256,\n",
+ " pos_fraction=0.5,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=False),\n",
+ " allowed_border=-1,\n",
+ " pos_weight=-1,\n",
+ " debug=False),\n",
+ " rpn_proposal=dict(\n",
+ " nms_pre=2000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.5,\n",
+ " neg_iou_thr=0.5,\n",
+ " min_pos_iou=0.5,\n",
+ " match_low_quality=False,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=512,\n",
+ " pos_fraction=0.25,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=True),\n",
+ " pos_weight=-1,\n",
+ " debug=False)),\n",
+ " test_cfg=dict(\n",
+ " rpn=dict(\n",
+ " nms_pre=1000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " score_thr=0.05,\n",
+ " nms=dict(type='nms', iou_threshold=0.5),\n",
+ " max_per_img=100)))\n",
+ "dataset_type = 'CocoDataset'\n",
+ "data_root = 'data/multisports/'\n",
+ "backend_args = None\n",
+ "train_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ "]\n",
+ "test_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ "]\n",
+ "train_dataloader = dict(\n",
+ " batch_size=2,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=True),\n",
+ " batch_sampler=dict(type='AspectRatioBatchSampler'),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_train.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_val.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "test_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/ms_infer_anno.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "test_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/ms_infer_anno.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n",
+ "val_cfg = dict(type='ValLoop')\n",
+ "test_cfg = dict(type='TestLoop')\n",
+ "param_scheduler = [\n",
+ " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n",
+ "]\n",
+ "optim_wrapper = dict(\n",
+ " type='OptimWrapper',\n",
+ " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n",
+ "auto_scale_lr = dict(enable=False, base_batch_size=16)\n",
+ "default_scope = 'mmdet'\n",
+ "default_hooks = dict(\n",
+ " timer=dict(type='IterTimerHook'),\n",
+ " logger=dict(type='LoggerHook', interval=50),\n",
+ " param_scheduler=dict(type='ParamSchedulerHook'),\n",
+ " checkpoint=dict(type='CheckpointHook', interval=1),\n",
+ " sampler_seed=dict(type='DistSamplerSeedHook'),\n",
+ " visualization=dict(type='DetVisualizationHook'))\n",
+ "env_cfg = dict(\n",
+ " cudnn_benchmark=False,\n",
+ " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n",
+ " dist_cfg=dict(backend='nccl'))\n",
+ "vis_backends = [dict(type='LocalVisBackend')]\n",
+ "visualizer = dict(\n",
+ " type='DetLocalVisualizer',\n",
+ " vis_backends=[dict(type='LocalVisBackend')],\n",
+ " name='visualizer')\n",
+ "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n",
+ "log_level = 'INFO'\n",
+ "load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'\n",
+ "resume = False\n",
+ "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n",
+ "launcher = 'none'\n",
+ "work_dir = 'work_dirs/det_model'\n",
+ "\n",
+ "06/15 06:42:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n",
+ "06/15 06:42:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n",
+ "before_run:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "before_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_train_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DistSamplerSeedHook \n",
+ " -------------------- \n",
+ "before_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_val_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_val_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train:\n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_test_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_run:\n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "06/15 06:42:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: open-mmlab://detectron2/resnet50_caffe\n",
+ "06/15 06:42:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by openmmlab backend from path: open-mmlab://detectron2/resnet50_caffe\n",
+ "Downloading: \"https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth\" to /root/.cache/torch/hub/checkpoints/resnet50_msra-5891d200.pth\n",
+ "100% 89.9M/89.9M [00:03<00:00, 31.4MB/s]\n",
+ "06/15 06:42:53 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n",
+ "\n",
+ "unexpected key in source state_dict: conv1.bias\n",
+ "\n",
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n",
+ "Downloading: \"https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\" to /root/.cache/torch/hub/checkpoints/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n",
+ "100% 158M/158M [00:06<00:00, 24.4MB/s]\n",
+ "06/15 06:43:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth\n",
+ "06/15 06:43:00 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n",
+ "06/15 06:43:00 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n",
+ "06/15 06:43:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/det_model.\n",
+ "06/15 06:43:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 50/118] lr: 5.0000e-03 eta: 0:02:00 time: 0.6468 data_time: 0.0127 memory: 3419 loss: 0.4823 loss_rpn_cls: 0.0063 loss_rpn_bbox: 0.0151 loss_cls: 0.1676 acc: 95.0195 loss_bbox: 0.2933\n",
+ "06/15 06:43:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 5.0000e-03 eta: 0:01:17 time: 0.4922 data_time: 0.0077 memory: 3419 loss: 0.4234 loss_rpn_cls: 0.0031 loss_rpn_bbox: 0.0134 loss_cls: 0.1394 acc: 91.9922 loss_bbox: 0.2676\n",
+ "06/15 06:44:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_064239\n",
+ "06/15 06:44:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n",
+ "06/15 06:44:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 50/120] eta: 0:00:08 time: 0.1269 data_time: 0.0112 memory: 3419 \n",
+ "06/15 06:44:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1159 data_time: 0.0032 memory: 682 \n",
+ "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n",
+ "Loading and preparing results...\n",
+ "DONE (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "Running per image evaluation...\n",
+ "Evaluate annotation type *bbox*\n",
+ "DONE (t=0.04s).\n",
+ "Accumulating evaluation results...\n",
+ "DONE (t=0.01s).\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.913\n",
+ " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.817\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.908\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.960\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.960\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.960\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.960\n",
+ "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.913 -1.000 -1.000 -1.000 0.817 0.908\n",
+ "06/15 06:44:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9600 data_time: 0.0065 time: 0.1205\n",
+ "06/15 06:44:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 50/118] lr: 5.0000e-03 eta: 0:00:37 time: 0.5233 data_time: 0.0099 memory: 3419 loss: 0.3250 loss_rpn_cls: 0.0025 loss_rpn_bbox: 0.0107 loss_cls: 0.1116 acc: 95.2148 loss_bbox: 0.2002\n",
+ "06/15 06:45:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 5.0000e-03 eta: 0:00:09 time: 0.5354 data_time: 0.0083 memory: 3419 loss: 0.3042 loss_rpn_cls: 0.0013 loss_rpn_bbox: 0.0105 loss_cls: 0.0946 acc: 94.9219 loss_bbox: 0.1978\n",
+ "06/15 06:45:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person_20230615_064239\n",
+ "06/15 06:45:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n",
+ "06/15 06:45:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 50/120] eta: 0:00:08 time: 0.1237 data_time: 0.0050 memory: 3419 \n",
+ "06/15 06:45:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1225 data_time: 0.0058 memory: 682 \n",
+ "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n",
+ "Loading and preparing results...\n",
+ "DONE (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "Running per image evaluation...\n",
+ "Evaluate annotation type *bbox*\n",
+ "DONE (t=0.07s).\n",
+ "Accumulating evaluation results...\n",
+ "DONE (t=0.01s).\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.912\n",
+ " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 0.747\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.916\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = 0.955\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = 0.955\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = 0.955\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = 1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = 0.954\n",
+ "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: 0.912 -1.000 -1.000 -1.000 0.747 0.916\n",
+ "06/15 06:45:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: 0.9550 data_time: 0.0052 time: 0.1228\n",
+ "\u001b[32mTraining finished successfully. \u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "!mim train mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n",
+ " --work-dir work_dirs/det_model"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-pf9MnuUxRAh"
+ },
+ "source": [
+ "### 2.4 Generate proposal bboxes\n",
+ "\n",
+ "When training the spatiotemporal action detection model, the proposals must come from the detection model's inference results rather than from the annotated ground-truth boxes. Therefore, we run inference on the whole dataset with the trained detection model, and then convert the resulting proposals into the required format for the subsequent training.\n",
+ "\n",
+ "#### 2.4.1 Convert the dataset to COCO format\n",
+ "\n",
+ "We provide a script that converts the MultiSports dataset into an annotation format without ground truth, which is used for inference."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "nL2n0AKJxRAi",
+ "outputId": "51907af1-7da3-4713-8e90-a61b052000aa"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[>>] 2350/2350, 1606.7 task/s, elapsed: 1s, ETA: 0s\n",
+ "save json file: data/multisports/rawframes/../annotations/ms_infer_anno.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "!echo 'person' > data/multisports/annotations/label_map.txt\n",
+ "!python tools/images2coco.py \\\n",
+ " data/multisports/rawframes \\\n",
+ " data/multisports/annotations/label_map.txt \\\n",
+ " ms_infer_anno.json"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_REQniysxRAj"
+ },
+ "source": [
+ "#### 2.4.2 Run inference to generate the proposal file"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ShnTsjs1xRAk"
+ },
+ "source": [
+ "Inference with MMDetection models is likewise done through MIM; for more testing commands, refer to the MIM [tutorial](https://github.com/open-mmlab/mim#command).\n",
+ "\n",
+ "After inference finishes, the results are saved to 'data/multisports/annotations/ms_det_proposals.pkl'.\n",
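+ "\n",
+ "A minimal sketch for inspecting the dumped file is shown below; the path matches the --out argument of the test command, and the dump format (a list of per-image result dicts) is an assumption based on MMDetection 3.x's result-dumping behaviour:\n",
+ "\n",
+ "```python\n",
+ "import pickle\n",
+ "\n",
+ "with open('data/multisports/annotations/ms_det_proposals.pkl', 'rb') as f:\n",
+ "    results = pickle.load(f)\n",
+ "\n",
+ "print(len(results))       # one entry per image\n",
+ "print(results[0].keys())  # e.g. 'img_path', 'pred_instances', ...\n",
+ "```"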
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "DXnT4aArxRAm",
+ "outputId": "565faf02-4b7f-49ab-f30f-b20e7eb09085"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Testing command is /usr/bin/python3 /usr/local/lib/python3.10/dist-packages/mmdet/.mim/tools/test.py configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py work_dirs/det_model/epoch_2.pth --launcher none --out data/multisports/annotations/ms_det_proposals.pkl. \n",
+ "06/15 06:45:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 1403639615\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
+ " - LAPACK is enabled (usually provided by MKL)\n",
+ " - NNPACK is enabled\n",
+ " - CPU capability usage: AVX2\n",
+ " - CUDA Runtime 11.8\n",
+ " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n",
+ " - CuDNN 8.7\n",
+ " - Magma 2.6.1\n",
+ " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n",
+ "\n",
+ " TorchVision: 0.15.2+cu118\n",
+ " OpenCV: 4.7.0\n",
+ " MMEngine: 0.7.4\n",
+ "\n",
+ "Runtime environment:\n",
+ " cudnn_benchmark: False\n",
+ " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n",
+ " dist_cfg: {'backend': 'nccl'}\n",
+ " seed: 1403639615\n",
+ " Distributed launcher: none\n",
+ " Distributed training: False\n",
+ " GPU number: 1\n",
+ "------------------------------------------------------------\n",
+ "\n",
+ "06/15 06:45:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n",
+ "model = dict(\n",
+ " type='FasterRCNN',\n",
+ " data_preprocessor=dict(\n",
+ " type='DetDataPreprocessor',\n",
+ " mean=[103.53, 116.28, 123.675],\n",
+ " std=[1.0, 1.0, 1.0],\n",
+ " bgr_to_rgb=False,\n",
+ " pad_size_divisor=32),\n",
+ " backbone=dict(\n",
+ " type='ResNet',\n",
+ " depth=50,\n",
+ " num_stages=4,\n",
+ " out_indices=(0, 1, 2, 3),\n",
+ " frozen_stages=1,\n",
+ " norm_cfg=dict(type='BN', requires_grad=False),\n",
+ " norm_eval=True,\n",
+ " style='caffe',\n",
+ " init_cfg=dict(\n",
+ " type='Pretrained',\n",
+ " checkpoint='open-mmlab://detectron2/resnet50_caffe')),\n",
+ " neck=dict(\n",
+ " type='FPN',\n",
+ " in_channels=[256, 512, 1024, 2048],\n",
+ " out_channels=256,\n",
+ " num_outs=5),\n",
+ " rpn_head=dict(\n",
+ " type='RPNHead',\n",
+ " in_channels=256,\n",
+ " feat_channels=256,\n",
+ " anchor_generator=dict(\n",
+ " type='AnchorGenerator',\n",
+ " scales=[8],\n",
+ " ratios=[0.5, 1.0, 2.0],\n",
+ " strides=[4, 8, 16, 32, 64]),\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[1.0, 1.0, 1.0, 1.0]),\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0)),\n",
+ " roi_head=dict(\n",
+ " type='StandardRoIHead',\n",
+ " bbox_roi_extractor=dict(\n",
+ " type='SingleRoIExtractor',\n",
+ " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n",
+ " out_channels=256,\n",
+ " featmap_strides=[4, 8, 16, 32]),\n",
+ " bbox_head=dict(\n",
+ " type='Shared2FCBBoxHead',\n",
+ " in_channels=256,\n",
+ " fc_out_channels=1024,\n",
+ " roi_feat_size=7,\n",
+ " num_classes=1,\n",
+ " bbox_coder=dict(\n",
+ " type='DeltaXYWHBBoxCoder',\n",
+ " target_means=[0.0, 0.0, 0.0, 0.0],\n",
+ " target_stds=[0.1, 0.1, 0.2, 0.2]),\n",
+ " reg_class_agnostic=False,\n",
+ " loss_cls=dict(\n",
+ " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n",
+ " loss_bbox=dict(type='L1Loss', loss_weight=1.0))),\n",
+ " train_cfg=dict(\n",
+ " rpn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.7,\n",
+ " neg_iou_thr=0.3,\n",
+ " min_pos_iou=0.3,\n",
+ " match_low_quality=True,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=256,\n",
+ " pos_fraction=0.5,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=False),\n",
+ " allowed_border=-1,\n",
+ " pos_weight=-1,\n",
+ " debug=False),\n",
+ " rpn_proposal=dict(\n",
+ " nms_pre=2000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssigner',\n",
+ " pos_iou_thr=0.5,\n",
+ " neg_iou_thr=0.5,\n",
+ " min_pos_iou=0.5,\n",
+ " match_low_quality=False,\n",
+ " ignore_iof_thr=-1),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=512,\n",
+ " pos_fraction=0.25,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=True),\n",
+ " pos_weight=-1,\n",
+ " debug=False)),\n",
+ " test_cfg=dict(\n",
+ " rpn=dict(\n",
+ " nms_pre=1000,\n",
+ " max_per_img=1000,\n",
+ " nms=dict(type='nms', iou_threshold=0.7),\n",
+ " min_bbox_size=0),\n",
+ " rcnn=dict(\n",
+ " score_thr=0.05,\n",
+ " nms=dict(type='nms', iou_threshold=0.5),\n",
+ " max_per_img=100)))\n",
+ "dataset_type = 'CocoDataset'\n",
+ "data_root = 'data/multisports/'\n",
+ "backend_args = None\n",
+ "train_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ "]\n",
+ "test_pipeline = [\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ "]\n",
+ "train_dataloader = dict(\n",
+ " batch_size=2,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=True),\n",
+ " batch_sampler=dict(type='AspectRatioBatchSampler'),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_train.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='RandomChoiceResize',\n",
+ " scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),\n",
+ " (1333, 768), (1333, 800)],\n",
+ " keep_ratio=True),\n",
+ " dict(type='RandomFlip', prob=0.5),\n",
+ " dict(type='PackDetInputs')\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/multisports_det_anno_val.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "test_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " drop_last=False,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False),\n",
+ " dataset=dict(\n",
+ " type='CocoDataset',\n",
+ " data_root='data/multisports/',\n",
+ " ann_file='annotations/ms_infer_anno.json',\n",
+ " data_prefix=dict(img='rawframes/'),\n",
+ " test_mode=True,\n",
+ " pipeline=[\n",
+ " dict(type='LoadImageFromFile', backend_args=None),\n",
+ " dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n",
+ " dict(type='LoadAnnotations', with_bbox=True),\n",
+ " dict(\n",
+ " type='PackDetInputs',\n",
+ " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n",
+ " 'scale_factor'))\n",
+ " ],\n",
+ " backend_args=None,\n",
+ " metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))\n",
+ "val_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_det_anno_val.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "test_evaluator = dict(\n",
+ " type='CocoMetric',\n",
+ " ann_file='data/multisports/annotations/ms_infer_anno.json',\n",
+ " metric='bbox',\n",
+ " format_only=False,\n",
+ " backend_args=None,\n",
+ " metric_items=['mAP_50', 'AR@100'],\n",
+ " iou_thrs=[0.5])\n",
+ "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_interval=1)\n",
+ "val_cfg = dict(type='ValLoop')\n",
+ "test_cfg = dict(type='TestLoop')\n",
+ "param_scheduler = [\n",
+ " dict(type='ConstantLR', factor=1.0, by_epoch=False, begin=0, end=500)\n",
+ "]\n",
+ "optim_wrapper = dict(\n",
+ " type='OptimWrapper',\n",
+ " optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))\n",
+ "auto_scale_lr = dict(enable=False, base_batch_size=16)\n",
+ "default_scope = 'mmdet'\n",
+ "default_hooks = dict(\n",
+ " timer=dict(type='IterTimerHook'),\n",
+ " logger=dict(type='LoggerHook', interval=50),\n",
+ " param_scheduler=dict(type='ParamSchedulerHook'),\n",
+ " checkpoint=dict(type='CheckpointHook', interval=1),\n",
+ " sampler_seed=dict(type='DistSamplerSeedHook'),\n",
+ " visualization=dict(type='DetVisualizationHook'))\n",
+ "env_cfg = dict(\n",
+ " cudnn_benchmark=False,\n",
+ " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n",
+ " dist_cfg=dict(backend='nccl'))\n",
+ "vis_backends = [dict(type='LocalVisBackend')]\n",
+ "visualizer = dict(\n",
+ " type='DetLocalVisualizer',\n",
+ " vis_backends=[dict(type='LocalVisBackend')],\n",
+ " name='visualizer')\n",
+ "log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)\n",
+ "log_level = 'INFO'\n",
+ "load_from = 'work_dirs/det_model/epoch_2.pth'\n",
+ "resume = False\n",
+ "metainfo = dict(classes=('person', ), palette=[(220, 20, 60)])\n",
+ "launcher = 'none'\n",
+ "work_dir = './work_dirs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person'\n",
+ "\n",
+ "06/15 06:45:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n",
+ "06/15 06:45:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n",
+ "before_run:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "before_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_train_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DistSamplerSeedHook \n",
+ " -------------------- \n",
+ "before_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_val_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_val_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train:\n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_test_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DetVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_run:\n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "loading annotations into memory...\n",
+ "Done (t=0.00s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "06/15 06:45:56 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The prefix is not set in metric class DumpDetResults.\n",
+ "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n",
+ "06/15 06:45:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from work_dirs/det_model/epoch_2.pth\n",
+ "06/15 06:46:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 50/2350] eta: 0:05:46 time: 0.1507 data_time: 0.0046 memory: 512 \n",
+ "06/15 06:46:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 100/2350] eta: 0:05:06 time: 0.1217 data_time: 0.0059 memory: 512 \n",
+ "06/15 06:46:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 150/2350] eta: 0:04:47 time: 0.1193 data_time: 0.0022 memory: 512 \n",
+ "06/15 06:46:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 200/2350] eta: 0:04:34 time: 0.1197 data_time: 0.0023 memory: 512 \n",
+ "06/15 06:46:29 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 250/2350] eta: 0:04:27 time: 0.1258 data_time: 0.0073 memory: 512 \n",
+ "06/15 06:46:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 300/2350] eta: 0:04:19 time: 0.1215 data_time: 0.0026 memory: 512 \n",
+ "06/15 06:46:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 350/2350] eta: 0:04:12 time: 0.1242 data_time: 0.0046 memory: 512 \n",
+ "06/15 06:46:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 400/2350] eta: 0:04:04 time: 0.1218 data_time: 0.0029 memory: 512 \n",
+ "06/15 06:46:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 450/2350] eta: 0:03:58 time: 0.1229 data_time: 0.0042 memory: 512 \n",
+ "06/15 06:46:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 500/2350] eta: 0:03:51 time: 0.1229 data_time: 0.0048 memory: 512 \n",
+ "06/15 06:47:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 550/2350] eta: 0:03:44 time: 0.1193 data_time: 0.0020 memory: 512 \n",
+ "06/15 06:47:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 600/2350] eta: 0:03:37 time: 0.1234 data_time: 0.0060 memory: 512 \n",
+ "06/15 06:47:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 650/2350] eta: 0:03:30 time: 0.1184 data_time: 0.0025 memory: 512 \n",
+ "06/15 06:47:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 700/2350] eta: 0:03:24 time: 0.1200 data_time: 0.0041 memory: 512 \n",
+ "06/15 06:47:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 750/2350] eta: 0:03:17 time: 0.1216 data_time: 0.0046 memory: 512 \n",
+ "06/15 06:47:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 800/2350] eta: 0:03:11 time: 0.1184 data_time: 0.0024 memory: 512 \n",
+ "06/15 06:47:42 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 850/2350] eta: 0:03:04 time: 0.1234 data_time: 0.0064 memory: 512 \n",
+ "06/15 06:47:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 900/2350] eta: 0:02:58 time: 0.1196 data_time: 0.0028 memory: 512 \n",
+ "06/15 06:47:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [ 950/2350] eta: 0:02:52 time: 0.1217 data_time: 0.0046 memory: 512 \n",
+ "06/15 06:48:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1000/2350] eta: 0:02:45 time: 0.1220 data_time: 0.0046 memory: 512 \n",
+ "06/15 06:48:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1050/2350] eta: 0:02:39 time: 0.1203 data_time: 0.0028 memory: 512 \n",
+ "06/15 06:48:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1100/2350] eta: 0:02:33 time: 0.1231 data_time: 0.0055 memory: 512 \n",
+ "06/15 06:48:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1150/2350] eta: 0:02:27 time: 0.1207 data_time: 0.0033 memory: 512 \n",
+ "06/15 06:48:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1200/2350] eta: 0:02:21 time: 0.1217 data_time: 0.0049 memory: 512 \n",
+ "06/15 06:48:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1250/2350] eta: 0:02:14 time: 0.1211 data_time: 0.0038 memory: 512 \n",
+ "06/15 06:48:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1300/2350] eta: 0:02:08 time: 0.1242 data_time: 0.0070 memory: 512 \n",
+ "06/15 06:48:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1350/2350] eta: 0:02:02 time: 0.1249 data_time: 0.0077 memory: 512 \n",
+ "06/15 06:48:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1400/2350] eta: 0:01:56 time: 0.1181 data_time: 0.0022 memory: 512 \n",
+ "06/15 06:48:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1450/2350] eta: 0:01:50 time: 0.1219 data_time: 0.0055 memory: 512 \n",
+ "06/15 06:49:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1500/2350] eta: 0:01:44 time: 0.1198 data_time: 0.0034 memory: 512 \n",
+ "06/15 06:49:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1550/2350] eta: 0:01:37 time: 0.1194 data_time: 0.0028 memory: 512 \n",
+ "06/15 06:49:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1600/2350] eta: 0:01:31 time: 0.1228 data_time: 0.0059 memory: 512 \n",
+ "06/15 06:49:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1650/2350] eta: 0:01:25 time: 0.1193 data_time: 0.0026 memory: 512 \n",
+ "06/15 06:49:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1700/2350] eta: 0:01:19 time: 0.1232 data_time: 0.0060 memory: 512 \n",
+ "06/15 06:49:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1750/2350] eta: 0:01:13 time: 0.1199 data_time: 0.0028 memory: 512 \n",
+ "06/15 06:49:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1800/2350] eta: 0:01:07 time: 0.1205 data_time: 0.0035 memory: 512 \n",
+ "06/15 06:49:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1850/2350] eta: 0:01:01 time: 0.1237 data_time: 0.0067 memory: 512 \n",
+ "06/15 06:49:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1900/2350] eta: 0:00:54 time: 0.1190 data_time: 0.0024 memory: 512 \n",
+ "06/15 06:49:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [1950/2350] eta: 0:00:48 time: 0.1238 data_time: 0.0069 memory: 512 \n",
+ "06/15 06:50:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2000/2350] eta: 0:00:42 time: 0.1183 data_time: 0.0020 memory: 512 \n",
+ "06/15 06:50:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2050/2350] eta: 0:00:36 time: 0.1212 data_time: 0.0049 memory: 512 \n",
+ "06/15 06:50:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2100/2350] eta: 0:00:30 time: 0.1212 data_time: 0.0044 memory: 512 \n",
+ "06/15 06:50:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2150/2350] eta: 0:00:24 time: 0.1180 data_time: 0.0019 memory: 512 \n",
+ "06/15 06:50:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2200/2350] eta: 0:00:18 time: 0.1233 data_time: 0.0062 memory: 512 \n",
+ "06/15 06:50:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2250/2350] eta: 0:00:12 time: 0.1186 data_time: 0.0021 memory: 512 \n",
+ "06/15 06:50:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2300/2350] eta: 0:00:06 time: 0.1227 data_time: 0.0064 memory: 512 \n",
+ "06/15 06:50:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] eta: 0:00:00 time: 0.1196 data_time: 0.0033 memory: 512 \n",
+ "06/15 06:50:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Evaluating bbox...\n",
+ "Loading and preparing results...\n",
+ "DONE (t=0.01s)\n",
+ "creating index...\n",
+ "index created!\n",
+ "Running per image evaluation...\n",
+ "Evaluate annotation type *bbox*\n",
+ "DONE (t=0.37s).\n",
+ "Accumulating evaluation results...\n",
+ "DONE (t=0.28s).\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n",
+ " Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=300 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=1000 ] = -1.000\n",
+ " Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=1000 ] = -1.000\n",
+ "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - bbox_mAP_copypaste: -1.000 -1.000 -1.000 -1.000 -1.000 -1.000\n",
+ "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Results has been saved to data/multisports/annotations/ms_det_proposals.pkl.\n",
+ "06/15 06:50:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(test) [2350/2350] coco/bbox_mAP_50: -1.0000 coco/bbox_AR@100: -1.0000 data_time: 0.0042 time: 0.1219\n",
+ "\u001b[32mTesting finished successfully.\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "!mim test mmdet configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n",
+ " --checkpoint work_dirs/det_model/epoch_2.pth \\\n",
+ " --out data/multisports/annotations/ms_det_proposals.pkl"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1zErF-nsxRAo"
+ },
+ "source": [
+ "## 3. 训练时空行为检测模型\n",
+ "\n",
+ "### 3.1 转换标注文件以及 proposal 文件\n",
+ "\n",
+ "MultiSports 数据集提供的标注文件,以及 MMDetection 推理生成的 proposal 都需要进行格式转换,才能用于时空行为检测模型的训练。我们已经提供了相关的脚本工具,执行后即可生成指定格式"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "naAfcO4QxRAo",
+ "outputId": "2a309bef-241f-44fc-8276-b2ea4735e37d"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loading test result...\n",
+ "[>>] 2350/2350, 3582.6 task/s, elapsed: 1s, ETA: 0s\n",
+ "\u001b[01;34mdata/multisports/annotations\u001b[00m\n",
+ "├── label_map.txt\n",
+ "├── ms_det_proposals.pkl\n",
+ "├── ms_infer_anno.json\n",
+ "├── multisports_det_anno_train.json\n",
+ "├── multisports_det_anno_val.json\n",
+ "├── \u001b[01;32mmultisports_GT.pkl\u001b[00m\n",
+ "├── multisports_proposals_train.pkl\n",
+ "├── multisports_proposals_val.pkl\n",
+ "├── multisports_train.csv\n",
+ "└── multisports_val.csv\n",
+ "\n",
+ "0 directories, 10 files\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 转换 anno 文件\n",
+ "!python ../../tools/data/multisports/parse_anno.py\n",
+ "\n",
+ "# 转换 proposal 文件\n",
+ "!python tools/convert_proposals.py\n",
+ "\n",
+ "!tree data/multisports/annotations"
+ ]
+ },
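+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The converted proposal pickle is consumed by `AVADataset` through its `proposal_file` option. As a rough sketch (the exact key formatting is decided by `tools/convert_proposals.py`, so the key string below is only an assumption), an AVA-style proposal file is a dict mapping a `'video_id,frame'` key to an array of normalized person boxes with detection scores:\n",
+ "\n",
+ "```python\n",
+ "import pickle\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "# Hypothetical illustration of the AVA-style proposal layout: each value is an\n",
+ "# (N, 5) array of [x1, y1, x2, y2, score] with coordinates normalized to [0, 1].\n",
+ "proposals = {\n",
+ "    'example_video,0001': np.array([[0.12, 0.20, 0.45, 0.90, 0.98]]),\n",
+ "}\n",
+ "\n",
+ "with open('example_proposals.pkl', 'wb') as f:\n",
+ "    pickle.dump(proposals, f)\n",
+ "```\n",
+ "\n",
+ "Loading `multisports_proposals_train.pkl` with `pickle.load` and inspecting a few keys is a quick way to confirm that the conversion produced what `AVADataset` expects."
+ ]
+ },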
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "balpcJEbxRAp"
+ },
+ "source": [
+ "### 3.2 训练时空行为检测模型\n",
+ "\n",
+ "MMAction2 中已经支持训练 MultiSports 数据集,这里只需要修改 proposal 文件的路径即可, 详细配置可以参考 [config](configs/slowonly_k400_multisports.py) 文件。由于训练数据较少,配置中将在完整 MultiSports 数据集上训练得到的模型作为预训练模型,使用自定义数据集训练时不需要指定 `load_from` 配置。"
+ ]
+ },
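+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a minimal sketch (assuming mmengine-style config inheritance; the file name below is hypothetical, while the field names mirror those printed in the training log that follows), overriding the proposal paths for your own data would look roughly like this:\n",
+ "\n",
+ "```python\n",
+ "# configs/slowonly_custom_multisports.py (hypothetical override file)\n",
+ "_base_ = ['slowonly_k400_multisports.py']\n",
+ "\n",
+ "anno_root = 'data/multisports/annotations'\n",
+ "proposal_file_train = f'{anno_root}/multisports_proposals_train.pkl'\n",
+ "proposal_file_val = f'{anno_root}/multisports_proposals_val.pkl'\n",
+ "\n",
+ "# mmengine merges these partial dicts into the inherited dataloader configs.\n",
+ "train_dataloader = dict(dataset=dict(proposal_file=proposal_file_train))\n",
+ "val_dataloader = dict(dataset=dict(proposal_file=proposal_file_val))\n",
+ "\n",
+ "# For a truly custom dataset there is no full-MultiSports checkpoint to start\n",
+ "# from, so `load_from` is simply left unset.\n",
+ "```"
+ ]
+ },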
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cIuQTmnuxRAq",
+ "outputId": "253d7f08-3c89-4e31-c5f4-3880aed5d817"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training command is /usr/bin/python3 /content/mmaction2/mmaction/.mim/tools/train.py configs/slowonly_k400_multisports.py --launcher none --work-dir work_dirs/stad_model/. \n",
+ "06/15 06:50:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: linux\n",
+ " Python: 3.10.12 (main, Jun 7 2023, 12:45:35) [GCC 9.4.0]\n",
+ " CUDA available: True\n",
+ " numpy_random_seed: 546414243\n",
+ " GPU 0: Tesla T4\n",
+ " CUDA_HOME: /usr/local/cuda\n",
+ " NVCC: Cuda compilation tools, release 11.8, V11.8.89\n",
+ " GCC: x86_64-linux-gnu-gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\n",
+ " PyTorch: 2.0.1+cu118\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 9.3\n",
+ " - C++ Version: 201703\n",
+ " - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications\n",
+ " - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n",
+ " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
+ " - LAPACK is enabled (usually provided by MKL)\n",
+ " - NNPACK is enabled\n",
+ " - CPU capability usage: AVX2\n",
+ " - CUDA Runtime 11.8\n",
+ " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n",
+ " - CuDNN 8.7\n",
+ " - Magma 2.6.1\n",
+ " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n",
+ "\n",
+ " TorchVision: 0.15.2+cu118\n",
+ " OpenCV: 4.7.0\n",
+ " MMEngine: 0.7.4\n",
+ "\n",
+ "Runtime environment:\n",
+ " cudnn_benchmark: False\n",
+ " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n",
+ " dist_cfg: {'backend': 'nccl'}\n",
+ " seed: 546414243\n",
+ " diff_rank_seed: False\n",
+ " deterministic: False\n",
+ " Distributed launcher: none\n",
+ " Distributed training: False\n",
+ " GPU number: 1\n",
+ "------------------------------------------------------------\n",
+ "\n",
+ "06/15 06:50:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n",
+ "default_scope = 'mmaction'\n",
+ "default_hooks = dict(\n",
+ " runtime_info=dict(type='RuntimeInfoHook', _scope_='mmaction'),\n",
+ " timer=dict(type='IterTimerHook', _scope_='mmaction'),\n",
+ " logger=dict(\n",
+ " type='LoggerHook', interval=20, ignore_last=False, _scope_='mmaction'),\n",
+ " param_scheduler=dict(type='ParamSchedulerHook', _scope_='mmaction'),\n",
+ " checkpoint=dict(\n",
+ " type='CheckpointHook',\n",
+ " interval=1,\n",
+ " save_best='auto',\n",
+ " _scope_='mmaction'),\n",
+ " sampler_seed=dict(type='DistSamplerSeedHook', _scope_='mmaction'),\n",
+ " sync_buffers=dict(type='SyncBuffersHook', _scope_='mmaction'))\n",
+ "env_cfg = dict(\n",
+ " cudnn_benchmark=False,\n",
+ " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n",
+ " dist_cfg=dict(backend='nccl'))\n",
+ "log_processor = dict(\n",
+ " type='LogProcessor', window_size=20, by_epoch=True, _scope_='mmaction')\n",
+ "vis_backends = [dict(type='LocalVisBackend', _scope_='mmaction')]\n",
+ "visualizer = dict(\n",
+ " type='ActionVisualizer',\n",
+ " vis_backends=[dict(type='LocalVisBackend')],\n",
+ " _scope_='mmaction')\n",
+ "log_level = 'INFO'\n",
+ "load_from = 'https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth'\n",
+ "resume = False\n",
+ "url = 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n",
+ "num_classes = 66\n",
+ "model = dict(\n",
+ " type='FastRCNN',\n",
+ " _scope_='mmdet',\n",
+ " init_cfg=dict(\n",
+ " type='Pretrained',\n",
+ " checkpoint=\n",
+ " 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth'\n",
+ " ),\n",
+ " backbone=dict(\n",
+ " type='mmaction.ResNet3dSlowOnly',\n",
+ " depth=50,\n",
+ " pretrained=None,\n",
+ " pretrained2d=False,\n",
+ " lateral=False,\n",
+ " num_stages=4,\n",
+ " conv1_kernel=(1, 7, 7),\n",
+ " conv1_stride_t=1,\n",
+ " pool1_stride_t=1,\n",
+ " spatial_strides=(1, 2, 2, 1)),\n",
+ " roi_head=dict(\n",
+ " type='AVARoIHead',\n",
+ " bbox_roi_extractor=dict(\n",
+ " type='SingleRoIExtractor3D',\n",
+ " roi_layer_type='RoIAlign',\n",
+ " output_size=8,\n",
+ " with_temporal_pool=True),\n",
+ " bbox_head=dict(\n",
+ " type='BBoxHeadAVA',\n",
+ " in_channels=2048,\n",
+ " num_classes=66,\n",
+ " multilabel=False,\n",
+ " dropout_ratio=0.5)),\n",
+ " data_preprocessor=dict(\n",
+ " type='mmaction.ActionDataPreprocessor',\n",
+ " mean=[123.675, 116.28, 103.53],\n",
+ " std=[58.395, 57.12, 57.375],\n",
+ " format_shape='NCTHW'),\n",
+ " train_cfg=dict(\n",
+ " rcnn=dict(\n",
+ " assigner=dict(\n",
+ " type='MaxIoUAssignerAVA',\n",
+ " pos_iou_thr=0.9,\n",
+ " neg_iou_thr=0.9,\n",
+ " min_pos_iou=0.9),\n",
+ " sampler=dict(\n",
+ " type='RandomSampler',\n",
+ " num=32,\n",
+ " pos_fraction=1,\n",
+ " neg_pos_ub=-1,\n",
+ " add_gt_as_proposals=True),\n",
+ " pos_weight=1.0)),\n",
+ " test_cfg=dict(rcnn=None))\n",
+ "dataset_type = 'AVADataset'\n",
+ "data_root = 'data/multisports/trainval'\n",
+ "anno_root = 'data/multisports/annotations'\n",
+ "ann_file_train = 'data/multisports/annotations/multisports_train.csv'\n",
+ "ann_file_val = 'data/multisports/annotations/multisports_val.csv'\n",
+ "gt_file = 'data/multisports/annotations/multisports_GT.pkl'\n",
+ "proposal_file_train = 'data/multisports/annotations/multisports_proposals_train.pkl'\n",
+ "proposal_file_val = 'data/multisports/annotations/multisports_proposals_val.pkl'\n",
+ "file_client_args = dict(io_backend='disk')\n",
+ "train_pipeline = [\n",
+ " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='DecordDecode', _scope_='mmaction'),\n",
+ " dict(type='RandomRescale', scale_range=(256, 320), _scope_='mmaction'),\n",
+ " dict(type='RandomCrop', size=256, _scope_='mmaction'),\n",
+ " dict(type='Flip', flip_ratio=0.5, _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='FormatShape',\n",
+ " input_format='NCTHW',\n",
+ " collapse=True,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='PackActionInputs', _scope_='mmaction')\n",
+ "]\n",
+ "val_pipeline = [\n",
+ " dict(type='DecordInit', io_backend='disk', _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " test_mode=True,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='DecordDecode', _scope_='mmaction'),\n",
+ " dict(type='Resize', scale=(-1, 256), _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='FormatShape',\n",
+ " input_format='NCTHW',\n",
+ " collapse=True,\n",
+ " _scope_='mmaction'),\n",
+ " dict(type='PackActionInputs', _scope_='mmaction')\n",
+ "]\n",
+ "train_dataloader = dict(\n",
+ " batch_size=2,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=True, _scope_='mmaction'),\n",
+ " dataset=dict(\n",
+ " type='AVADataset',\n",
+ " ann_file='data/multisports/annotations/multisports_train.csv',\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit', io_backend='disk'),\n",
+ " dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(type='RandomRescale', scale_range=(256, 320)),\n",
+ " dict(type='RandomCrop', size=256),\n",
+ " dict(type='Flip', flip_ratio=0.5),\n",
+ " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n",
+ " dict(type='PackActionInputs')\n",
+ " ],\n",
+ " num_classes=66,\n",
+ " proposal_file=\n",
+ " 'data/multisports/annotations/multisports_proposals_train.pkl',\n",
+ " data_prefix=dict(img='data/multisports/trainval'),\n",
+ " timestamp_start=1,\n",
+ " start_index=0,\n",
+ " use_frames=False,\n",
+ " fps=1,\n",
+ " _scope_='mmaction'))\n",
+ "val_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=2,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n",
+ " dataset=dict(\n",
+ " type='AVADataset',\n",
+ " ann_file='data/multisports/annotations/multisports_val.csv',\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit', io_backend='disk'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " test_mode=True),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(type='Resize', scale=(-1, 256)),\n",
+ " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n",
+ " dict(type='PackActionInputs')\n",
+ " ],\n",
+ " num_classes=66,\n",
+ " proposal_file=\n",
+ " 'data/multisports/annotations/multisports_proposals_val.pkl',\n",
+ " data_prefix=dict(img='data/multisports/trainval'),\n",
+ " test_mode=True,\n",
+ " timestamp_start=1,\n",
+ " start_index=0,\n",
+ " use_frames=False,\n",
+ " fps=1,\n",
+ " _scope_='mmaction'))\n",
+ "test_dataloader = dict(\n",
+ " batch_size=1,\n",
+ " num_workers=8,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(type='DefaultSampler', shuffle=False, _scope_='mmaction'),\n",
+ " dataset=dict(\n",
+ " type='AVADataset',\n",
+ " ann_file='data/multisports/annotations/multisports_val.csv',\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit', io_backend='disk'),\n",
+ " dict(\n",
+ " type='SampleAVAFrames',\n",
+ " clip_len=4,\n",
+ " frame_interval=16,\n",
+ " test_mode=True),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(type='Resize', scale=(-1, 256)),\n",
+ " dict(type='FormatShape', input_format='NCTHW', collapse=True),\n",
+ " dict(type='PackActionInputs')\n",
+ " ],\n",
+ " num_classes=66,\n",
+ " proposal_file=\n",
+ " 'data/multisports/annotations/multisports_dense_proposals_val.recall_96.13.pkl',\n",
+ " data_prefix=dict(img='data/multisports/trainval'),\n",
+ " test_mode=True,\n",
+ " timestamp_start=1,\n",
+ " start_index=0,\n",
+ " use_frames=False,\n",
+ " fps=1,\n",
+ " _scope_='mmaction'))\n",
+ "val_evaluator = dict(\n",
+ " type='MultiSportsMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_GT.pkl',\n",
+ " _scope_='mmaction')\n",
+ "test_evaluator = dict(\n",
+ " type='MultiSportsMetric',\n",
+ " ann_file='data/multisports/annotations/multisports_GT.pkl',\n",
+ " _scope_='mmaction')\n",
+ "train_cfg = dict(\n",
+ " type='EpochBasedTrainLoop',\n",
+ " max_epochs=8,\n",
+ " val_begin=1,\n",
+ " val_interval=1,\n",
+ " _scope_='mmaction')\n",
+ "val_cfg = dict(type='ValLoop', _scope_='mmaction')\n",
+ "test_cfg = dict(type='TestLoop', _scope_='mmaction')\n",
+ "param_scheduler = [\n",
+ " dict(\n",
+ " type='LinearLR',\n",
+ " start_factor=0.1,\n",
+ " by_epoch=True,\n",
+ " begin=0,\n",
+ " end=5,\n",
+ " _scope_='mmaction'),\n",
+ " dict(\n",
+ " type='MultiStepLR',\n",
+ " begin=0,\n",
+ " end=8,\n",
+ " by_epoch=True,\n",
+ " milestones=[6, 7],\n",
+ " gamma=0.1,\n",
+ " _scope_='mmaction')\n",
+ "]\n",
+ "optim_wrapper = dict(\n",
+ " optimizer=dict(\n",
+ " type='SGD',\n",
+ " lr=0.01,\n",
+ " momentum=0.9,\n",
+ " weight_decay=1e-05,\n",
+ " _scope_='mmaction'),\n",
+ " clip_grad=dict(max_norm=5, norm_type=2))\n",
+ "launcher = 'none'\n",
+ "work_dir = 'work_dirs/stad_model/'\n",
+ "randomness = dict(seed=None, diff_rank_seed=False, deterministic=False)\n",
+ "\n",
+ "06/15 06:51:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n",
+ "06/15 06:51:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n",
+ "before_run:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "before_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_train_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DistSamplerSeedHook \n",
+ " -------------------- \n",
+ "before_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) SyncBuffersHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_val_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) SyncBuffersHook \n",
+ " -------------------- \n",
+ "before_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_val_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train:\n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_test_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_run:\n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "06/15 06:51:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 236 out of 236 frames are valid.\n",
+ "06/15 06:51:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - 120 out of 120 frames are valid.\n",
+ "06/15 06:51:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n",
+ "06/15 06:51:07 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n",
+ "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb_20220901-e7b65fad.pth\n",
+ "100% 124M/124M [00:05<00:00, 25.9MB/s]\n",
+ "06/15 06:51:12 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n",
+ "\n",
+ "unexpected key in source state_dict: cls_head.fc_cls.weight, cls_head.fc_cls.bias\n",
+ "\n",
+ "missing keys in source state_dict: roi_head.bbox_head.fc_cls.weight, roi_head.bbox_head.fc_cls.bias\n",
+ "\n",
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\" to /root/.cache/torch/hub/checkpoints/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "100% 122M/122M [00:04<00:00, 29.7MB/s]\n",
+ "06/15 06:51:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Load checkpoint from https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "06/15 06:51:17 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"FileClient\" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io\n",
+ "06/15 06:51:17 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - \"HardDiskBackend\" is the alias of \"LocalBackend\" and the former will be deprecated in future.\n",
+ "06/15 06:51:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Checkpoints will be saved to /content/mmaction2/projects/stad_tutorial/work_dirs/stad_model.\n",
+ "06/15 06:51:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 20/118] lr: 1.0000e-03 eta: 0:07:06 time: 0.4613 data_time: 0.0472 memory: 1381 grad_norm: 17.8613 loss: 1.1505 recall@thr=0.5: 0.6667 prec@thr=0.5: 0.6667 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 1.1505\n",
+ "06/15 06:51:31 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 40/118] lr: 1.0000e-03 eta: 0:05:28 time: 0.2655 data_time: 0.0204 memory: 1381 grad_norm: 6.8642 loss: 0.5417 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5417\n",
+ "06/15 06:51:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 60/118] lr: 1.0000e-03 eta: 0:05:06 time: 0.3121 data_time: 0.0505 memory: 1381 grad_norm: 5.3190 loss: 0.6625 recall@thr=0.5: 0.9000 prec@thr=0.5: 0.9000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.6625\n",
+ "06/15 06:51:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][ 80/118] lr: 1.0000e-03 eta: 0:04:44 time: 0.2771 data_time: 0.0255 memory: 1381 grad_norm: 3.0057 loss: 0.6646 recall@thr=0.5: 0.9231 prec@thr=0.5: 0.9231 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.6646\n",
+ "06/15 06:51:48 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][100/118] lr: 1.0000e-03 eta: 0:04:26 time: 0.2625 data_time: 0.0130 memory: 1381 grad_norm: 1.8442 loss: 0.5711 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5711\n",
+ "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [1][118/118] lr: 1.0000e-03 eta: 0:04:18 time: 0.2930 data_time: 0.0322 memory: 1381 grad_norm: 2.5183 loss: 0.6887 recall@thr=0.5: 0.6923 prec@thr=0.5: 0.6923 recall@top3: 0.6923 prec@top3: 0.2308 recall@top5: 0.6923 prec@top5: 0.1385 loss_action_cls: 0.6887\n",
+ "06/15 06:51:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 1 epochs\n",
+ "06/15 06:51:59 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 20/120] eta: 0:00:14 time: 0.1446 data_time: 0.0853 memory: 466 \n",
+ "06/15 06:52:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 40/120] eta: 0:00:10 time: 0.1124 data_time: 0.0612 memory: 466 \n",
+ "06/15 06:52:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 60/120] eta: 0:00:07 time: 0.1016 data_time: 0.0505 memory: 466 \n",
+ "06/15 06:52:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][ 80/120] eta: 0:00:04 time: 0.1083 data_time: 0.0581 memory: 466 \n",
+ "06/15 06:52:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][100/120] eta: 0:00:02 time: 0.1650 data_time: 0.1102 memory: 466 \n",
+ "06/15 06:52:11 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] eta: 0:00:00 time: 0.1410 data_time: 0.0866 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 47.41\n",
+ "aerobic split jump 30.01\n",
+ "aerobic scissors leap 88.94\n",
+ "aerobic turn 98.43\n",
+ "mAP 66.20\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 25.00\n",
+ "aerobic split jump 20.00\n",
+ "aerobic scissors leap 80.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 56.25\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 45.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 36.25\n",
+ "06/15 06:52:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [1][120/120] mAP/frameAP: 66.1965 mAP/v_map@0.2: 56.2500 mAP/v_map@0.5: 36.2500 mAP/v_map_0.05:0.45: 50.4167 mAP/v_map_0.10:0.90: 37.7963 mAP/v_map_0.50:0.95: 26.8167 data_time: 0.0753 time: 0.1288\n",
+ "06/15 06:52:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - The best checkpoint with 66.1965 mAP/frameAP at 1 epoch is saved to best_mAP_frameAP_epoch_1.pth.\n",
+ "06/15 06:52:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 20/118] lr: 3.2500e-03 eta: 0:04:11 time: 0.3098 data_time: 0.0484 memory: 1381 grad_norm: 1.1745 loss: 0.4384 recall@thr=0.5: 0.7857 prec@thr=0.5: 0.7857 recall@top3: 0.9286 prec@top3: 0.3095 recall@top5: 0.9286 prec@top5: 0.1857 loss_action_cls: 0.4384\n",
+ "06/15 06:52:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 40/118] lr: 3.2500e-03 eta: 0:04:06 time: 0.3245 data_time: 0.0667 memory: 1381 grad_norm: 1.0271 loss: 0.3960 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 0.9333 prec@top3: 0.3111 recall@top5: 0.9333 prec@top5: 0.1867 loss_action_cls: 0.3960\n",
+ "06/15 06:52:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 60/118] lr: 3.2500e-03 eta: 0:03:55 time: 0.2572 data_time: 0.0111 memory: 1381 grad_norm: 0.8150 loss: 0.3958 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3958\n",
+ "06/15 06:52:41 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][ 80/118] lr: 3.2500e-03 eta: 0:03:47 time: 0.2843 data_time: 0.0167 memory: 1381 grad_norm: 1.4691 loss: 0.4575 recall@thr=0.5: 0.9333 prec@thr=0.5: 0.9333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4575\n",
+ "06/15 06:52:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][100/118] lr: 3.2500e-03 eta: 0:03:41 time: 0.3118 data_time: 0.0559 memory: 1381 grad_norm: 1.9420 loss: 0.5529 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.5529\n",
+ "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [2][118/118] lr: 3.2500e-03 eta: 0:03:33 time: 0.2532 data_time: 0.0082 memory: 1381 grad_norm: 1.6790 loss: 0.4253 recall@thr=0.5: 0.7500 prec@thr=0.5: 0.7500 recall@top3: 0.8333 prec@top3: 0.2778 recall@top5: 0.8333 prec@top5: 0.1667 loss_action_cls: 0.4253\n",
+ "06/15 06:52:52 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 2 epochs\n",
+ "06/15 06:52:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 20/120] eta: 0:00:15 time: 0.1515 data_time: 0.0968 memory: 466 \n",
+ "06/15 06:53:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 40/120] eta: 0:00:12 time: 0.1679 data_time: 0.1143 memory: 466 \n",
+ "06/15 06:53:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 60/120] eta: 0:00:08 time: 0.1134 data_time: 0.0631 memory: 466 \n",
+ "06/15 06:53:04 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][ 80/120] eta: 0:00:05 time: 0.0961 data_time: 0.0459 memory: 466 \n",
+ "06/15 06:53:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][100/120] eta: 0:00:02 time: 0.1063 data_time: 0.0549 memory: 466 \n",
+ "06/15 06:53:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] eta: 0:00:00 time: 0.1017 data_time: 0.0522 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 42.09\n",
+ "aerobic split jump 27.71\n",
+ "aerobic scissors leap 90.02\n",
+ "aerobic turn 95.76\n",
+ "mAP 63.89\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 20.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 55.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 36.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 34.00\n",
+ "06/15 06:53:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [2][120/120] mAP/frameAP: 63.8934 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 51.8889 mAP/v_map_0.10:0.90: 34.0278 mAP/v_map_0.50:0.95: 18.7250 data_time: 0.0710 time: 0.1226\n",
+ "06/15 06:53:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 20/118] lr: 5.5000e-03 eta: 0:03:34 time: 0.4330 data_time: 0.1493 memory: 1381 grad_norm: 0.4795 loss: 0.5049 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 0.8462 prec@top3: 0.2821 recall@top5: 0.8462 prec@top5: 0.1692 loss_action_cls: 0.5049\n",
+ "06/15 06:53:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 40/118] lr: 5.5000e-03 eta: 0:03:27 time: 0.2948 data_time: 0.0370 memory: 1381 grad_norm: 0.8584 loss: 0.4820 recall@thr=0.5: 0.6154 prec@thr=0.5: 0.6154 recall@top3: 0.6154 prec@top3: 0.2051 recall@top5: 0.6154 prec@top5: 0.1231 loss_action_cls: 0.4820\n",
+ "06/15 06:53:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 60/118] lr: 5.5000e-03 eta: 0:03:19 time: 0.2622 data_time: 0.0118 memory: 1381 grad_norm: 1.1041 loss: 0.2944 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2944\n",
+ "06/15 06:53:35 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][ 80/118] lr: 5.5000e-03 eta: 0:03:13 time: 0.3111 data_time: 0.0470 memory: 1381 grad_norm: 0.8394 loss: 0.3393 recall@thr=0.5: 0.9091 prec@thr=0.5: 0.9091 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3393\n",
+ "06/15 06:53:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][100/118] lr: 5.5000e-03 eta: 0:03:06 time: 0.2989 data_time: 0.0417 memory: 1381 grad_norm: 0.2155 loss: 0.4345 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 0.8182 prec@top3: 0.2727 recall@top5: 0.8182 prec@top5: 0.1636 loss_action_cls: 0.4345\n",
+ "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [3][118/118] lr: 5.5000e-03 eta: 0:02:59 time: 0.2576 data_time: 0.0112 memory: 1381 grad_norm: 0.2509 loss: 0.4634 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4634\n",
+ "06/15 06:53:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 3 epochs\n",
+ "06/15 06:53:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 20/120] eta: 0:00:18 time: 0.1815 data_time: 0.1180 memory: 466 \n",
+ "06/15 06:53:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 40/120] eta: 0:00:13 time: 0.1451 data_time: 0.0905 memory: 466 \n",
+ "06/15 06:53:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 60/120] eta: 0:00:08 time: 0.1020 data_time: 0.0510 memory: 466 \n",
+ "06/15 06:53:57 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][ 80/120] eta: 0:00:05 time: 0.1008 data_time: 0.0528 memory: 466 \n",
+ "06/15 06:54:00 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][100/120] eta: 0:00:02 time: 0.1072 data_time: 0.0569 memory: 466 \n",
+ "06/15 06:54:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] eta: 0:00:00 time: 0.1018 data_time: 0.0536 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 37.09\n",
+ "aerobic split jump 27.98\n",
+ "aerobic scissors leap 89.41\n",
+ "aerobic turn 95.67\n",
+ "mAP 62.54\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 20.00\n",
+ "aerobic scissors leap 100.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 55.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 36.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 34.00\n",
+ "06/15 06:54:02 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [3][120/120] mAP/frameAP: 62.5361 mAP/v_map@0.2: 55.0000 mAP/v_map@0.5: 34.0000 mAP/v_map_0.05:0.45: 51.2222 mAP/v_map_0.10:0.90: 34.1389 mAP/v_map_0.50:0.95: 18.7250 data_time: 0.0704 time: 0.1229\n",
+ "06/15 06:54:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 20/118] lr: 7.7500e-03 eta: 0:02:55 time: 0.3717 data_time: 0.0993 memory: 1381 grad_norm: 0.2139 loss: 0.3119 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3119\n",
+ "06/15 06:54:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 40/118] lr: 7.7500e-03 eta: 0:02:48 time: 0.2730 data_time: 0.0230 memory: 1381 grad_norm: 0.6102 loss: 0.4782 recall@thr=0.5: 0.9375 prec@thr=0.5: 0.9375 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4782\n",
+ "06/15 06:54:21 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 60/118] lr: 7.7500e-03 eta: 0:02:41 time: 0.2895 data_time: 0.0311 memory: 1381 grad_norm: 0.4057 loss: 0.3422 recall@thr=0.5: 0.9474 prec@thr=0.5: 0.9474 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3422\n",
+ "06/15 06:54:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][ 80/118] lr: 7.7500e-03 eta: 0:02:36 time: 0.3170 data_time: 0.0490 memory: 1381 grad_norm: 0.3051 loss: 0.3628 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3628\n",
+ "06/15 06:54:32 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][100/118] lr: 7.7500e-03 eta: 0:02:29 time: 0.2633 data_time: 0.0131 memory: 1381 grad_norm: 0.1671 loss: 0.3691 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3691\n",
+ "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [4][118/118] lr: 7.7500e-03 eta: 0:02:23 time: 0.2721 data_time: 0.0181 memory: 1381 grad_norm: 0.1954 loss: 0.3076 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3076\n",
+ "06/15 06:54:37 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 4 epochs\n",
+ "06/15 06:54:43 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 20/120] eta: 0:00:14 time: 0.1431 data_time: 0.0854 memory: 466 \n",
+ "06/15 06:54:45 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 40/120] eta: 0:00:10 time: 0.1086 data_time: 0.0584 memory: 466 \n",
+ "06/15 06:54:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 60/120] eta: 0:00:07 time: 0.1056 data_time: 0.0552 memory: 466 \n",
+ "06/15 06:54:49 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][ 80/120] eta: 0:00:04 time: 0.0922 data_time: 0.0399 memory: 466 \n",
+ "06/15 06:54:51 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][100/120] eta: 0:00:02 time: 0.1166 data_time: 0.0671 memory: 466 \n",
+ "06/15 06:54:54 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] eta: 0:00:00 time: 0.1468 data_time: 0.0927 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 25.62\n",
+ "aerobic split jump 28.75\n",
+ "aerobic scissors leap 89.02\n",
+ "aerobic turn 93.30\n",
+ "mAP 59.17\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 20.00\n",
+ "aerobic scissors leap 80.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 50.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 45.00\n",
+ "aerobic turn 100.00\n",
+ "mAP 36.25\n",
+ "06/15 06:54:55 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [4][120/120] mAP/frameAP: 59.1749 mAP/v_map@0.2: 50.0000 mAP/v_map@0.5: 36.2500 mAP/v_map_0.05:0.45: 46.9444 mAP/v_map_0.10:0.90: 28.9352 mAP/v_map_0.50:0.95: 14.6667 data_time: 0.0663 time: 0.1186\n",
+ "06/15 06:55:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 20/118] lr: 1.0000e-02 eta: 0:02:17 time: 0.3090 data_time: 0.0513 memory: 1381 grad_norm: 0.2988 loss: 0.3067 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3067\n",
+ "06/15 06:55:06 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 40/118] lr: 1.0000e-02 eta: 0:02:10 time: 0.2584 data_time: 0.0142 memory: 1381 grad_norm: 0.6702 loss: 0.3996 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3996\n",
+ "06/15 06:55:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 60/118] lr: 1.0000e-02 eta: 0:02:04 time: 0.3286 data_time: 0.0617 memory: 1381 grad_norm: 0.4347 loss: 0.4374 recall@thr=0.5: 0.8462 prec@thr=0.5: 0.8462 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4374\n",
+ "06/15 06:55:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][ 80/118] lr: 1.0000e-02 eta: 0:01:58 time: 0.2774 data_time: 0.0247 memory: 1381 grad_norm: 0.4373 loss: 0.3679 recall@thr=0.5: 0.7500 prec@thr=0.5: 0.7500 recall@top3: 0.8750 prec@top3: 0.2917 recall@top5: 0.8750 prec@top5: 0.1750 loss_action_cls: 0.3679\n",
+ "06/15 06:55:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][100/118] lr: 1.0000e-02 eta: 0:01:51 time: 0.2603 data_time: 0.0108 memory: 1381 grad_norm: 0.2507 loss: 0.3226 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3226\n",
+ "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [5][118/118] lr: 1.0000e-02 eta: 0:01:46 time: 0.3256 data_time: 0.0497 memory: 1381 grad_norm: 0.0940 loss: 0.2914 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2914\n",
+ "06/15 06:55:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 5 epochs\n",
+ "06/15 06:55:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 20/120] eta: 0:00:11 time: 0.1166 data_time: 0.0625 memory: 466 \n",
+ "06/15 06:55:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 40/120] eta: 0:00:09 time: 0.1119 data_time: 0.0618 memory: 466 \n",
+ "06/15 06:55:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 60/120] eta: 0:00:06 time: 0.1012 data_time: 0.0504 memory: 466 \n",
+ "06/15 06:55:40 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][ 80/120] eta: 0:00:04 time: 0.1017 data_time: 0.0537 memory: 466 \n",
+ "06/15 06:55:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][100/120] eta: 0:00:02 time: 0.1766 data_time: 0.1239 memory: 466 \n",
+ "06/15 06:55:46 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] eta: 0:00:00 time: 0.1421 data_time: 0.0884 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 17.82\n",
+ "aerobic split jump 20.05\n",
+ "aerobic scissors leap 89.00\n",
+ "aerobic turn 91.20\n",
+ "mAP 54.52\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 80.00\n",
+ "aerobic turn 60.00\n",
+ "mAP 35.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 45.00\n",
+ "aerobic turn 26.67\n",
+ "mAP 17.92\n",
+ "06/15 06:55:47 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [5][120/120] mAP/frameAP: 54.5189 mAP/v_map@0.2: 35.0000 mAP/v_map@0.5: 17.9167 mAP/v_map_0.05:0.45: 31.2037 mAP/v_map_0.10:0.90: 19.0741 mAP/v_map_0.50:0.95: 9.5833 data_time: 0.0733 time: 0.1249\n",
+ "06/15 06:55:53 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 20/118] lr: 1.0000e-02 eta: 0:01:40 time: 0.2867 data_time: 0.0385 memory: 1381 grad_norm: 0.1572 loss: 0.3008 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3008\n",
+ "06/15 06:55:58 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 40/118] lr: 1.0000e-02 eta: 0:01:34 time: 0.2720 data_time: 0.0167 memory: 1381 grad_norm: 0.0803 loss: 0.2377 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2377\n",
+ "06/15 06:56:05 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 60/118] lr: 1.0000e-02 eta: 0:01:28 time: 0.3423 data_time: 0.0840 memory: 1381 grad_norm: 0.3120 loss: 0.2442 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2442\n",
+ "06/15 06:56:10 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][ 80/118] lr: 1.0000e-02 eta: 0:01:22 time: 0.2580 data_time: 0.0112 memory: 1381 grad_norm: 0.5726 loss: 0.3794 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3794\n",
+ "06/15 06:56:16 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][100/118] lr: 1.0000e-02 eta: 0:01:16 time: 0.2949 data_time: 0.0347 memory: 1381 grad_norm: 0.1732 loss: 0.3004 recall@thr=0.5: 0.8750 prec@thr=0.5: 0.8750 recall@top3: 0.8750 prec@top3: 0.2917 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3004\n",
+ "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [6][118/118] lr: 1.0000e-02 eta: 0:01:10 time: 0.3258 data_time: 0.0625 memory: 1381 grad_norm: 0.3709 loss: 0.3439 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3439\n",
+ "06/15 06:56:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 6 epochs\n",
+ "06/15 06:56:26 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 20/120] eta: 0:00:11 time: 0.1169 data_time: 0.0624 memory: 466 \n",
+ "06/15 06:56:28 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 40/120] eta: 0:00:09 time: 0.1131 data_time: 0.0631 memory: 466 \n",
+ "06/15 06:56:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 60/120] eta: 0:00:06 time: 0.1064 data_time: 0.0553 memory: 466 \n",
+ "06/15 06:56:33 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][ 80/120] eta: 0:00:04 time: 0.1401 data_time: 0.0862 memory: 466 \n",
+ "06/15 06:56:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][100/120] eta: 0:00:02 time: 0.1519 data_time: 0.0982 memory: 466 \n",
+ "06/15 06:56:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] eta: 0:00:00 time: 0.0986 data_time: 0.0486 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 19.05\n",
+ "aerobic split jump 22.20\n",
+ "aerobic scissors leap 85.83\n",
+ "aerobic turn 79.04\n",
+ "mAP 51.53\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 80.00\n",
+ "aerobic turn 0.00\n",
+ "mAP 20.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 45.00\n",
+ "aerobic turn 0.00\n",
+ "mAP 11.25\n",
+ "06/15 06:56:38 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [6][120/120] mAP/frameAP: 51.5300 mAP/v_map@0.2: 20.0000 mAP/v_map@0.5: 11.2500 mAP/v_map_0.05:0.45: 18.0556 mAP/v_map_0.10:0.90: 11.8519 mAP/v_map_0.50:0.95: 6.9167 data_time: 0.0688 time: 0.1209\n",
+ "06/15 06:56:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 20/118] lr: 1.0000e-03 eta: 0:01:04 time: 0.2819 data_time: 0.0331 memory: 1381 grad_norm: 0.2811 loss: 0.2776 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2776\n",
+ "06/15 06:56:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 40/118] lr: 1.0000e-03 eta: 0:00:58 time: 0.3114 data_time: 0.0473 memory: 1381 grad_norm: 0.1573 loss: 0.2043 recall@thr=0.5: 0.8182 prec@thr=0.5: 0.8182 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2043\n",
+ "06/15 06:56:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 60/118] lr: 1.0000e-03 eta: 0:00:52 time: 0.2903 data_time: 0.0342 memory: 1381 grad_norm: 0.1343 loss: 0.3411 recall@thr=0.5: 0.8667 prec@thr=0.5: 0.8667 recall@top3: 0.8667 prec@top3: 0.2889 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3411\n",
+ "06/15 06:57:01 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][ 80/118] lr: 1.0000e-03 eta: 0:00:46 time: 0.2623 data_time: 0.0128 memory: 1381 grad_norm: 0.1026 loss: 0.2895 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2895\n",
+ "06/15 06:57:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][100/118] lr: 1.0000e-03 eta: 0:00:40 time: 0.3206 data_time: 0.0503 memory: 1381 grad_norm: 0.1911 loss: 0.3552 recall@thr=0.5: 0.7333 prec@thr=0.5: 0.7333 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3552\n",
+ "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [7][118/118] lr: 1.0000e-03 eta: 0:00:35 time: 0.2884 data_time: 0.0335 memory: 1381 grad_norm: 0.1274 loss: 0.4391 recall@thr=0.5: 0.8571 prec@thr=0.5: 0.8571 recall@top3: 0.8571 prec@top3: 0.2857 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.4391\n",
+ "06/15 06:57:13 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 7 epochs\n",
+ "06/15 06:57:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 20/120] eta: 0:00:11 time: 0.1193 data_time: 0.0693 memory: 466 \n",
+ "06/15 06:57:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 40/120] eta: 0:00:09 time: 0.1188 data_time: 0.0670 memory: 466 \n",
+ "06/15 06:57:23 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 60/120] eta: 0:00:08 time: 0.1645 data_time: 0.1114 memory: 466 \n",
+ "06/15 06:57:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][ 80/120] eta: 0:00:05 time: 0.1391 data_time: 0.0850 memory: 466 \n",
+ "06/15 06:57:27 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][100/120] eta: 0:00:02 time: 0.1104 data_time: 0.0585 memory: 466 \n",
+ "06/15 06:57:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] eta: 0:00:00 time: 0.1025 data_time: 0.0512 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 20.79\n",
+ "aerobic split jump 20.11\n",
+ "aerobic scissors leap 84.84\n",
+ "aerobic turn 78.58\n",
+ "mAP 51.08\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 80.00\n",
+ "aerobic turn 20.00\n",
+ "mAP 25.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 45.00\n",
+ "aerobic turn 0.00\n",
+ "mAP 11.25\n",
+ "06/15 06:57:30 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [7][120/120] mAP/frameAP: 51.0794 mAP/v_map@0.2: 25.0000 mAP/v_map@0.5: 11.2500 mAP/v_map_0.05:0.45: 22.5000 mAP/v_map_0.10:0.90: 14.0741 mAP/v_map_0.50:0.95: 6.9167 data_time: 0.0735 time: 0.1255\n",
+ "06/15 06:57:36 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 20/118] lr: 1.0000e-04 eta: 0:00:29 time: 0.2894 data_time: 0.0322 memory: 1381 grad_norm: 0.1227 loss: 0.3286 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3286\n",
+ "06/15 06:57:44 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 40/118] lr: 1.0000e-04 eta: 0:00:23 time: 0.4105 data_time: 0.1257 memory: 1381 grad_norm: 0.1948 loss: 0.3202 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3202\n",
+ "06/15 06:57:50 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 60/118] lr: 1.0000e-04 eta: 0:00:17 time: 0.3095 data_time: 0.0537 memory: 1381 grad_norm: 0.7997 loss: 0.2428 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2428\n",
+ "06/15 06:57:56 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][ 80/118] lr: 1.0000e-04 eta: 0:00:11 time: 0.2918 data_time: 0.0330 memory: 1381 grad_norm: 0.8157 loss: 0.3045 recall@thr=0.5: 1.0000 prec@thr=0.5: 1.0000 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.3045\n",
+ "06/15 06:58:03 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][100/118] lr: 1.0000e-04 eta: 0:00:05 time: 0.3443 data_time: 0.0786 memory: 1381 grad_norm: 0.0966 loss: 0.2605 recall@thr=0.5: 0.9375 prec@thr=0.5: 0.9375 recall@top3: 1.0000 prec@top3: 0.3333 recall@top5: 1.0000 prec@top5: 0.2000 loss_action_cls: 0.2605\n",
+ "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Exp name: slowonly_k400_multisports_20230615_065057\n",
+ "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(train) [8][118/118] lr: 1.0000e-04 eta: 0:00:00 time: 0.2611 data_time: 0.0148 memory: 1381 grad_norm: 0.3034 loss: 0.2694 recall@thr=0.5: 0.9231 prec@thr=0.5: 0.9231 recall@top3: 0.9231 prec@top3: 0.3077 recall@top5: 0.9231 prec@top5: 0.1846 loss_action_cls: 0.2694\n",
+ "06/15 06:58:08 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Saving checkpoint at 8 epochs\n",
+ "06/15 06:58:12 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 20/120] eta: 0:00:14 time: 0.1433 data_time: 0.0869 memory: 466 \n",
+ "06/15 06:58:15 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 40/120] eta: 0:00:12 time: 0.1664 data_time: 0.1160 memory: 466 \n",
+ "06/15 06:58:18 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 60/120] eta: 0:00:08 time: 0.1269 data_time: 0.0772 memory: 466 \n",
+ "06/15 06:58:20 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][ 80/120] eta: 0:00:05 time: 0.0951 data_time: 0.0455 memory: 466 \n",
+ "06/15 06:58:22 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][100/120] eta: 0:00:02 time: 0.1144 data_time: 0.0630 memory: 466 \n",
+ "06/15 06:58:24 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] eta: 0:00:00 time: 0.1028 data_time: 0.0530 memory: 466 \n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluate aerobic kick jump\n",
+ "do not evaluate aerobic off axis jump\n",
+ "do not evaluate aerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluate aerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluate basketball save\n",
+ "do not evaluate basketball jump ball\n",
+ "frameAP_0.5\n",
+ "\n",
+ "aerobic straight jump 15.29\n",
+ "aerobic split jump 20.74\n",
+ "aerobic scissors leap 86.38\n",
+ "aerobic turn 80.98\n",
+ "mAP 50.85\n",
+ "\u001b[2Klinking tubes... \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:00:00\u001b[0m\n",
+ "\u001b[?25hno such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.2\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 80.00\n",
+ "aerobic turn 20.00\n",
+ "mAP 25.00\n",
+ "no such label 0 aerobic push up\n",
+ "no such label 1 aerobic explosive push up\n",
+ "no such label 2 aerobic explosive support\n",
+ "no such label 3 aerobic leg circle\n",
+ "no such label 4 aerobic helicopter\n",
+ "no such label 5 aerobic support\n",
+ "no such label 6 aerobic v support\n",
+ "no such label 7 aerobic horizontal support\n",
+ "no such label 9 aerobic illusion\n",
+ "no such label 10 aerobic bent leg(s) jump\n",
+ "no such label 11 aerobic pike jump\n",
+ "no such label 12 aerobic straddle jump\n",
+ "do not evaluateaerobic kick jump\n",
+ "do not evaluateaerobic off axis jump\n",
+ "do not evaluateaerobic butterfly jump\n",
+ "no such label 18 aerobic split\n",
+ "do not evaluateaerobic balance turn\n",
+ "no such label 21 volleyball serve\n",
+ "no such label 22 volleyball block\n",
+ "no such label 23 volleyball first pass\n",
+ "no such label 24 volleyball defend\n",
+ "no such label 25 volleyball protect\n",
+ "no such label 26 volleyball second pass\n",
+ "no such label 27 volleyball adjust\n",
+ "no such label 28 volleyball save\n",
+ "no such label 29 volleyball second attack\n",
+ "no such label 30 volleyball spike\n",
+ "no such label 31 volleyball dink\n",
+ "no such label 32 volleyball no offensive attack\n",
+ "no such label 33 football shoot\n",
+ "no such label 34 football long pass\n",
+ "no such label 35 football short pass\n",
+ "no such label 36 football through pass\n",
+ "no such label 37 football cross\n",
+ "no such label 38 football dribble\n",
+ "no such label 39 football trap\n",
+ "no such label 40 football throw\n",
+ "no such label 41 football diving\n",
+ "no such label 42 football tackle\n",
+ "no such label 43 football steal\n",
+ "no such label 44 football clearance\n",
+ "no such label 45 football block\n",
+ "no such label 46 football press\n",
+ "no such label 47 football aerial duels\n",
+ "no such label 48 basketball pass\n",
+ "no such label 49 basketball drive\n",
+ "no such label 50 basketball dribble\n",
+ "no such label 51 basketball 3-point shot\n",
+ "no such label 52 basketball 2-point shot\n",
+ "no such label 53 basketball free throw\n",
+ "no such label 54 basketball block\n",
+ "no such label 55 basketball offensive rebound\n",
+ "no such label 56 basketball defensive rebound\n",
+ "no such label 57 basketball pass steal\n",
+ "no such label 58 basketball dribble steal\n",
+ "no such label 59 basketball interfere shot\n",
+ "no such label 60 basketball pick-and-roll defensive\n",
+ "no such label 61 basketball sag\n",
+ "no such label 62 basketball screen\n",
+ "no such label 63 basketball pass-inbound\n",
+ "do not evaluatebasketball save\n",
+ "do not evaluatebasketball jump ball\n",
+ "VideoAP_0.5\n",
+ "\n",
+ "aerobic straight jump 0.00\n",
+ "aerobic split jump 0.00\n",
+ "aerobic scissors leap 45.00\n",
+ "aerobic turn 20.00\n",
+ "mAP 16.25\n",
+ "06/15 06:58:25 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Epoch(val) [8][120/120] mAP/frameAP: 50.8487 mAP/v_map@0.2: 25.0000 mAP/v_map@0.5: 16.2500 mAP/v_map_0.05:0.45: 23.0556 mAP/v_map_0.10:0.90: 15.1852 mAP/v_map_0.50:0.95: 8.4167 data_time: 0.0732 time: 0.1244\n",
+ "\u001b[32mTraining finished successfully. \u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 使用 MIM 训练模型\n",
+ "!mim train mmaction2 configs/slowonly_k400_multisports.py \\\n",
+ " --work-dir work_dirs/stad_model/"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HCg6C9HYxRAt"
+ },
+ "source": [
+ "## 4. 时空行为检测模型推理\n",
+ "\n",
+ "训练得到检测模型和时空行为检测模型后,我们可以利用时空行为检测 demo 进行推理,可视化模型效果。\n",
+ "\n",
+ "由于 tutorial 中使用的训练数据较少,模型性能较差,所以可视化时使用预先训练好的模型。"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WW5-IJ7IxRAu"
+ },
+ "source": [
+ "###"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FofW_5RoxRAu",
+ "outputId": "91217660-946d-48ab-f663-b0f7f2d6a6f6"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n",
+ "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n",
+ "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n",
+ "ALSA lib confmisc.c:767:(parse_card) cannot find card '0'\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory\n",
+ "ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name\n",
+ "ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory\n",
+ "ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory\n",
+ "ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default\n",
+ "Loads checkpoint by local backend from path: work_dirs/det_model/epoch_2.pth\n",
+ "Performing Human Detection for each frame\n",
+ "[>>] 99/99, 6.8 task/s, elapsed: 15s, ETA: 0s\n",
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth\n",
+ "Performing SpatioTemporal Action Detection for each clip\n",
+ "[>>] 99/99, 16.6 task/s, elapsed: 6s, ETA: 0sPerforming visualization\n",
+ "Moviepy - Building video data/demo_spatiotemporal_det.mp4.\n",
+ "Moviepy - Writing video data/demo_spatiotemporal_det.mp4\n",
+ "\n",
+ "Moviepy - Done !\n",
+ "Moviepy - video ready data/demo_spatiotemporal_det.mp4\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python ../../demo/demo_spatiotemporal_det.py \\\n",
+ " data/multisports/test/aerobic_gymnastics/v_7G_IpU0FxLU_c001.mp4 \\\n",
+ " data/demo_spatiotemporal_det.mp4 \\\n",
+ " --config configs/slowonly_k400_multisports.py \\\n",
+ " --checkpoint https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb_20230320-a1ca5e76.pth \\\n",
+ " --det-config configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco_ms_person.py \\\n",
+ " --det-checkpoint work_dirs/det_model/epoch_2.pth \\\n",
+ " --det-score-thr 0.85 \\\n",
+ " --action-score-thr 0.8 \\\n",
+ " --label-map ../../tools/data/multisports/label_map.txt \\\n",
+ " --predict-stepsize 8 \\\n",
+ " --output-stepsize 1 \\\n",
+ " --output-fps 24"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 741
+ },
+ "id": "677FUWFRxRAv",
+ "outputId": "f702d544-3492-494c-af81-9e90f43d6b6c"
+ },
+ "outputs": [],
+ "source": [
+ "# Show Video\n",
+ "import moviepy.editor\n",
+ "moviepy.editor.ipython_display(\"data/demo_spatiotemporal_det.mp4\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.0"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/projects/stad_tutorial/tools/convert_proposals.py b/projects/stad_tutorial/tools/convert_proposals.py
new file mode 100644
index 0000000000..4e43c6dd89
--- /dev/null
+++ b/projects/stad_tutorial/tools/convert_proposals.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import numpy as np
+from mmengine import dump, load, track_iter_progress
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--det_test_result',
+ default='data/multisports/annotations/ms_det_proposals.pkl')
+ parser.add_argument(
+ '--stad_gt',
+ help='spatio-temporal action detection ground truth file',
+ default='data/multisports/annotations/multisports_GT.pkl')
+ parser.add_argument(
+ '--out_result',
+ default='data/multisports/annotations/multisports_proposals.pkl')
+ args = parser.parse_args()
+ return args
+
+
+def dump_det_result(args):
+ print('loading test result...')
+ det_result = load(args.det_test_result)
+ stad_gt = load(args.stad_gt)
+ train_list = stad_gt['train_videos'][0]
+ val_list = stad_gt['test_videos'][0]
+ train_bbox_result = {}
+ val_bbox_result = {}
+ for sample in track_iter_progress(det_result):
+ bboxes = sample['pred_instances']['bboxes']
+ scores = sample['pred_instances']['scores']
+ h, w = sample['ori_shape']
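+        # normalize bbox coordinates to [0, 1] by the original frame size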
+ bboxes[:, ::2] /= w
+ bboxes[:, 1::2] /= h
+ img_path = sample['img_path']
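+        # build frame key '<sport>/<video>.mp4,<frame_idx:04d>' from img_path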
+ frm_key_list = img_path.split('.jpg')[0].split('/')
+ frm_key = ','.join([
+ f'{frm_key_list[-3]}/{frm_key_list[-2]}.mp4',
+ f'{int(frm_key_list[-1]):04d}'
+ ])
+ bbox = np.concatenate([bboxes, scores[:, None]], axis=1)
+
+ vid_key = '/'.join(frm_key_list[-3:-1])
+ if vid_key in train_list:
+ train_bbox_result[frm_key] = bbox
+ elif vid_key in val_list:
+ val_bbox_result[frm_key] = bbox
+ else:
+ raise KeyError(vid_key)
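+    # dump train/val proposals into separate pkl files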
+ dump(train_bbox_result, args.out_result[:-4] + '_train.pkl')
+ dump(val_bbox_result, args.out_result[:-4] + '_val.pkl')
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ dump_det_result(args)
diff --git a/projects/stad_tutorial/tools/generate_mmdet_anno.py b/projects/stad_tutorial/tools/generate_mmdet_anno.py
new file mode 100644
index 0000000000..7b4ba62994
--- /dev/null
+++ b/projects/stad_tutorial/tools/generate_mmdet_anno.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import defaultdict
+
+from mmengine import dump, load
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ 'stad_anno', help='spatiotemporal action detection anno path')
+ parser.add_argument('det_path', help='output detection anno path')
+ args = parser.parse_args()
+ return args
+
+
+def generate_mmdet_coco_anno(args):
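+    # convert MultiSports gt tubes to COCO-style detection annotations with a
+    # single 'person' category and dump one json file per train/val split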
+ ori_anno = load(args.stad_anno)
+ train_videos = ori_anno['train_videos']
+ val_videos = ori_anno['test_videos']
+ videos = {'train': train_videos, 'val': val_videos}
+ for split in ['train', 'val']:
+ img_id = 0
+ bbox_id = 0
+ img_list = []
+ anno_list = []
+ for vid in videos[split][0]:
+ vid_tubes = ori_anno['gttubes'][vid]
+ height, width = ori_anno['resolution'][vid]
+ frm2bbox = defaultdict(list)
+ for label_idx, tube_list in vid_tubes.items():
+ for tube in tube_list:
+ for frm_anno in tube:
+ frm_idx, bbox = frm_anno[0], frm_anno[1:]
+ frm2bbox[frm_idx].append({'label': 0, 'bbox': bbox})
+ for frm_idx, frm_bboxes in frm2bbox.items():
+ img_path = f'{vid}/{int(frm_idx):05d}.jpg'
+ img_instance = {
+ 'file_name': img_path,
+ 'height': height,
+ 'width': width,
+ 'id': img_id
+ }
+ img_list.append(img_instance)
+
+ for bbox_info in frm_bboxes:
+ label = bbox_info['label']
+ x1, y1, x2, y2 = bbox_info['bbox']
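+                    # COCO boxes are [x, y, w, h]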
+ bbox = [x1, y1, x2 - x1, y2 - y1]
+ anno_instance = {
+ 'area': bbox[2] * bbox[3],
+ 'image_id': img_id,
+ 'bbox': bbox,
+ 'category_id': label,
+ 'iscrowd': 0,
+ 'id': bbox_id
+ }
+ anno_list.append(anno_instance)
+ bbox_id += 1
+ img_id += 1
+ total_anno = {
+ 'images': img_list,
+ 'annotations': anno_list,
+ 'categories': [{
+ 'id': 0,
+ 'name': 'person'
+ }],
+ }
+ dump(total_anno, args.det_path[:-5] + f'_{split}' + args.det_path[-5:])
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ generate_mmdet_coco_anno(args)
diff --git a/projects/stad_tutorial/tools/generate_rgb.py b/projects/stad_tutorial/tools/generate_rgb.py
new file mode 100644
index 0000000000..61952f4b21
--- /dev/null
+++ b/projects/stad_tutorial/tools/generate_rgb.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+
+import cv2
+
+src_dir = 'data/multisports/trainval'
+target_dir = 'data/multisports/rawframes'
+
+sport_list = ['aerobic_gymnastics']
+for sport in sport_list:
+ video_root = osp.join(src_dir, sport)
+ if not osp.exists(video_root):
+        print('Skip {}: video dir not found.'.format(video_root))
+        continue
+    print('Will generate rgb frame dirs for {} videos of {}.'.format(
+        len(os.listdir(video_root)), osp.basename(sport)))
+ for clip_name in os.listdir(video_root):
+ mp4_path = osp.join(video_root, clip_name)
+ save_dir = osp.join(target_dir, sport, clip_name[:-4])
+ if not osp.exists(save_dir):
+ os.makedirs(save_dir)
+ cap = cv2.VideoCapture(mp4_path)
+ fps = cap.get(cv2.CAP_PROP_FPS)
+ size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+ int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
+ fourcc = cv2.VideoWriter_fourcc(*'I420')
+ ii = 1
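+        # decode every frame and save it as 00001.jpg, 00002.jpg, ...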
+        while cap.isOpened():
+ ret, frame = cap.read()
+ aa = str(ii)
+ s = aa.zfill(5)
+            image_name = osp.join(save_dir, s + '.jpg')
+            if ret:
+ cv2.imwrite(image_name, frame)
+ else:
+ break
+ ii = ii + 1
+ cap.release()
+        print('Generated rgb frames for {}.'.format(clip_name[:-4]))
diff --git a/projects/stad_tutorial/tools/images2coco.py b/projects/stad_tutorial/tools/images2coco.py
new file mode 100644
index 0000000000..18a2e4bb0d
--- /dev/null
+++ b/projects/stad_tutorial/tools/images2coco.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+
+from mmengine.fileio import dump, list_from_file
+from mmengine.utils import mkdir_or_exist, scandir, track_parallel_progress
+from PIL import Image
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Convert images to coco format without annotations')
+ parser.add_argument('img_path', help='The root path of images')
+ parser.add_argument(
+        'classes', type=str, help='Text file storing the class name list')
+ parser.add_argument(
+ 'out',
+ type=str,
+        help='The output annotation json file name; it is saved to an '
+        '"annotations" directory next to img_path')
+ parser.add_argument(
+ '-e',
+ '--exclude-extensions',
+ type=str,
+ nargs='+',
+ help='The suffix of images to be excluded, such as "png" and "bmp"')
+ args = parser.parse_args()
+ return args
+
+
+def get_img_info(args):
+ path, image_path, exclude_extensions = args
+    if exclude_extensions is None or not image_path.lower().endswith(
+            tuple(exclude_extensions)):
+        # read the image to get its width and height
+ img_pillow = Image.open(os.path.join(path, image_path))
+ img_info = {
+ 'filename': image_path,
+ 'width': img_pillow.width,
+ 'height': img_pillow.height,
+ }
+ return img_info
+
+
+def collect_image_infos(path, exclude_extensions=None):
+ img_infos = []
+
+ images_generator = scandir(path, recursive=True)
+
+ img_infos = track_parallel_progress(
+ get_img_info, [(path, image_path, exclude_extensions)
+ for image_path in images_generator],
+ nproc=64)
+
+    # drop images skipped by exclude_extensions (get_img_info returns None)
+    img_infos = [info for info in img_infos if info is not None]
+    return img_infos
+
+
+def cvt_to_coco_json(img_infos, classes):
+ image_id = 0
+ coco = dict()
+ coco['images'] = []
+ coco['type'] = 'instance'
+ coco['categories'] = []
+ coco['annotations'] = []
+ image_set = set()
+
+ for category_id, name in enumerate(classes):
+ category_item = dict()
+ category_item['supercategory'] = str('none')
+ category_item['id'] = int(category_id)
+ category_item['name'] = str(name)
+ coco['categories'].append(category_item)
+
+ for img_dict in img_infos:
+ file_name = img_dict['filename']
+ assert file_name not in image_set
+ image_item = dict()
+ image_item['id'] = int(image_id)
+ image_item['file_name'] = str(file_name)
+ image_item['height'] = int(img_dict['height'])
+ image_item['width'] = int(img_dict['width'])
+ coco['images'].append(image_item)
+ image_set.add(file_name)
+
+ image_id += 1
+ return coco
+
+
+def main():
+ args = parse_args()
+ assert args.out.endswith(
+        'json'), 'The output file name must have a .json suffix'
+
+ # 1 load image list info
+ img_infos = collect_image_infos(args.img_path, args.exclude_extensions)
+
+ # 2 convert to coco format data
+ classes = list_from_file(args.classes)
+ coco_info = cvt_to_coco_json(img_infos, classes)
+
+ # 3 dump
+ save_dir = os.path.join(args.img_path, '..', 'annotations')
+ mkdir_or_exist(save_dir)
+ save_path = os.path.join(save_dir, args.out)
+ dump(coco_info, save_path)
+ print(f'save json file: {save_path}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 828ae1f8e2..cc4b6aedb1 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -1,10 +1,14 @@
-docutils==0.16.0
+docutils==0.18.1
einops
+modelindex
myst-parser
opencv-python
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
scipy
-sphinx==4.0.2
+sphinx==6.1.3
+sphinx-notfound-page
+sphinx-tabs
sphinx_copybutton
sphinx_markdown_tables
-sphinx_rtd_theme==0.5.2
+sphinxcontrib-jquery
+tabulate
diff --git a/requirements/optional.txt b/requirements/optional.txt
index d2204c6441..ff4985ca37 100644
--- a/requirements/optional.txt
+++ b/requirements/optional.txt
@@ -1,10 +1,10 @@
av>=9.0
future
-fvcore
imgaug
librosa
lmdb
moviepy
+openai-clip
packaging
pims
PyTurboJPEG
diff --git a/setup.cfg b/setup.cfg
index 19a6462ca7..c1cc13df6e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,3 +19,7 @@ known_first_party = mmaction
known_third_party = cv2,decord,einops,joblib,matplotlib,mmcv,numpy,pandas,pytest,pytorch_sphinx_theme,scipy,seaborn,titlecase,torch,webcolors
no_lines_before = STDLIB,LOCALFOLDER
default_section = THIRDPARTY
+
+[flake8]
+per-file-ignores =
+ mmaction/configs/*:F401,F403,F405
diff --git a/setup.py b/setup.py
index c8a8f8e0f2..4776e54145 100644
--- a/setup.py
+++ b/setup.py
@@ -119,7 +119,7 @@ def add_mim_extension():
else:
return
- filenames = ['tools', 'configs', 'model-index.yml']
+ filenames = ['tools', 'configs', 'model-index.yml', 'dataset-index.yml']
repo_path = osp.dirname(__file__)
mim_path = osp.join(repo_path, 'mmaction', '.mim')
os.makedirs(mim_path, exist_ok=True)
diff --git a/tests/apis/test_inference.py b/tests/apis/test_inference.py
index 6ecd89cb67..1b004943f7 100644
--- a/tests/apis/test_inference.py
+++ b/tests/apis/test_inference.py
@@ -1,13 +1,17 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from pathlib import Path
+from tempfile import TemporaryDirectory
from unittest import TestCase
import torch
+from mmengine.testing import assert_dict_has_keys
from parameterized import parameterized
-from mmaction.apis import inference_recognizer, init_recognizer
+from mmaction.apis import (detection_inference, inference_recognizer,
+ init_recognizer, pose_inference)
from mmaction.structures import ActionDataSample
+from mmaction.utils import frame_extract, get_str_type
class TestInference(TestCase):
@@ -55,7 +59,7 @@ def test_inference_recognizer(self, config, video_path, devices):
model = init_recognizer(config_file, device=device)
for ops in model.cfg.test_pipeline:
- if ops['type'] in ('TenCrop', 'ThreeCrop'):
+ if get_str_type(ops['type']) in ('TenCrop', 'ThreeCrop'):
# Use CenterCrop to reduce memory in order to pass CI
ops['type'] = 'CenterCrop'
@@ -63,3 +67,89 @@ def test_inference_recognizer(self, config, video_path, devices):
self.assertIsInstance(result, ActionDataSample)
self.assertTrue(result.pred_scores.item.shape, (400, ))
+
+ def test_detection_inference(self):
+ from mmdet.apis import init_detector
+ from mmdet.structures import DetDataSample
+
+ for device in ('cpu', 'cuda'):
+ if device == 'cuda' and not torch.cuda.is_available():
+ # Skip the test if cuda is required but unavailable
+ continue
+ project_dir = osp.abspath(osp.dirname(osp.dirname(__file__)))
+ project_dir = osp.join(project_dir, '..')
+ det_config = 'demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py' # noqa: E501
+ det_ckpt = 'http://download.openmmlab.com/mmdetection/' \
+ 'v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501
+ video_path = 'demo/demo_skeleton.mp4'
+ video_path = osp.join(project_dir, video_path)
+ config_file = osp.join(project_dir, det_config)
+ with TemporaryDirectory() as tmpdir:
+ frm_paths, _ = frame_extract(video_path, out_dir=tmpdir)
+ # skip remaining frames to speed up ut
+ frm_paths = frm_paths[:10]
+ results, data_samples = detection_inference(
+ config_file, det_ckpt, frm_paths, device=device)
+ self.assertTrue(results[0].shape, (4, ))
+ self.assertIsInstance(data_samples[0], DetDataSample)
+ # test with_score
+ results, data_samples = detection_inference(
+ config_file,
+ det_ckpt,
+ frm_paths,
+ with_score=True,
+ device=device)
+ self.assertTrue(results[0].shape, (5, ))
+ # test inference with model object
+ model = init_detector(
+ config=det_config, checkpoint=det_ckpt, device=device)
+ results, data_samples = detection_inference(
+ model, None, frm_paths, device=device)
+ self.assertTrue(results[0].shape, (4, ))
+ self.assertIsInstance(data_samples[0], DetDataSample)
+
+ def test_pose_inference(self):
+ from mmpose.apis import init_model
+ from mmpose.structures import PoseDataSample
+
+ for device in ('cpu', 'cuda'):
+ if device == 'cuda' and not torch.cuda.is_available():
+ # Skip the test if cuda is required but unavailable
+ continue
+ project_dir = osp.abspath(osp.dirname(osp.dirname(__file__)))
+ project_dir = osp.join(project_dir, '..')
+ det_config = 'demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py' # noqa: E501
+ det_ckpt = 'http://download.openmmlab.com/mmdetection/' \
+ 'v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501
+ pose_config = 'demo/demo_configs/' \
+ 'td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py'
+ pose_ckpt = 'https://download.openmmlab.com/mmpose/top_down/' \
+ 'hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth'
+ video_path = 'demo/demo_skeleton.mp4'
+ video_path = osp.join(project_dir, video_path)
+ pose_config = osp.join(project_dir, pose_config)
+ with TemporaryDirectory() as tmpdir:
+ frm_paths, _ = frame_extract(video_path, out_dir=tmpdir)
+ # skip remaining frames to speed up ut
+ frm_paths = frm_paths[:10]
+ det_results, _ = detection_inference(
+ det_config, det_ckpt, frm_paths, device=device)
+
+ results, data_samples = pose_inference(
+ pose_config,
+ pose_ckpt,
+ frm_paths,
+ det_results,
+ device=device)
+ assert_dict_has_keys(results[0], ('keypoints', 'bbox_scores',
+ 'bboxes', 'keypoint_scores'))
+ self.assertIsInstance(data_samples[0], PoseDataSample)
+
+ # test inference with model object
+ model = init_model(
+ config=pose_config, checkpoint=pose_ckpt, device=device)
+ results, data_samples = pose_inference(
+ model, None, frm_paths, det_results, device=device)
+ assert_dict_has_keys(results[0], ('keypoints', 'bbox_scores',
+ 'bboxes', 'keypoint_scores'))
+ self.assertIsInstance(data_samples[0], PoseDataSample)
diff --git a/tests/data/annotations/video_text_test_list.json b/tests/data/annotations/video_text_test_list.json
new file mode 100644
index 0000000000..99e968fe88
--- /dev/null
+++ b/tests/data/annotations/video_text_test_list.json
@@ -0,0 +1 @@
+{"test.mp4": ["A person is cleaning a swimming pool", "A person is using a cleaning machine to clean the swimming pool"]}
\ No newline at end of file
diff --git a/tests/data/eval_multisports/data_samples.pkl b/tests/data/eval_multisports/data_samples.pkl
new file mode 100644
index 0000000000..f1341c9bdf
Binary files /dev/null and b/tests/data/eval_multisports/data_samples.pkl differ
diff --git a/tests/data/eval_multisports/gt.pkl b/tests/data/eval_multisports/gt.pkl
new file mode 100644
index 0000000000..13bc68d0a1
Binary files /dev/null and b/tests/data/eval_multisports/gt.pkl differ
diff --git a/tests/data/multisports_dataset/multisports_proposals_sample.pkl b/tests/data/multisports_dataset/multisports_proposals_sample.pkl
new file mode 100644
index 0000000000..524b854599
Binary files /dev/null and b/tests/data/multisports_dataset/multisports_proposals_sample.pkl differ
diff --git a/tests/data/multisports_dataset/multisports_sample.csv b/tests/data/multisports_dataset/multisports_sample.csv
new file mode 100644
index 0000000000..d6bbe51e8a
--- /dev/null
+++ b/tests/data/multisports_dataset/multisports_sample.csv
@@ -0,0 +1,9 @@
+aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,377,0.706,0.439,0.794,0.811,11,0
+aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,378,0.689,0.438,0.794,0.804,11,0
+aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,379,0.672,0.419,0.802,0.797,11,0
+aerobic_gymnastics/v_aqMgwPExjD0_c001.mp4,380,0.680,0.361,0.791,0.783,11,0
+aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,443,0.109,0.669,0.345,0.768,1,0
+aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,444,0.112,0.668,0.347,0.767,1,0
+aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,445,0.115,0.663,0.350,0.761,1,0
+aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,446,0.117,0.644,0.352,0.757,1,0
+aerobic_gymnastics/v_yaKOumdXwbU_c019.mp4,447,0.118,0.636,0.352,0.754,1,0
\ No newline at end of file
diff --git a/tests/datasets/base.py b/tests/datasets/base.py
index 3748925c8d..d859d896e4 100644
--- a/tests/datasets/base.py
+++ b/tests/datasets/base.py
@@ -44,6 +44,8 @@ def setup_class(cls):
'video_test_list.txt')
cls.video_ann_file_multi_label = osp.join(
cls.ann_file_prefix, 'video_test_list_multi_label.txt')
+ cls.video_text_ann_file = osp.join(cls.ann_file_prefix,
+ 'video_text_test_list.json')
cls.pose_ann_file = osp.join(cls.ann_file_prefix, 'sample.pkl')
# pipeline configuration
@@ -140,6 +142,17 @@ def setup_class(cls):
dict(type='OpenCVDecode')
]
+ cls.video_text_pipeline = [
+ dict(type='OpenCVInit'),
+ dict(
+ type='SampleFrames',
+ clip_len=32,
+ frame_interval=2,
+ num_clips=1),
+ dict(type='OpenCVDecode'),
+ dict(type='CLIPTokenize')
+ ]
+
cls.hvu_categories = [
'action', 'attribute', 'concept', 'event', 'object', 'scene'
]
diff --git a/tests/datasets/test_ava_dataset.py b/tests/datasets/test_ava_dataset.py
index a75dd512b6..1ae4833858 100644
--- a/tests/datasets/test_ava_dataset.py
+++ b/tests/datasets/test_ava_dataset.py
@@ -4,7 +4,7 @@
import mmengine
import numpy as np
from mmengine.testing import assert_dict_has_keys
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_almost_equal, assert_array_equal
from mmaction.datasets import AVADataset, AVAKineticsDataset
from mmaction.utils import register_all_modules
@@ -23,7 +23,7 @@ def setup_class(cls):
cls.proposal_file = osp.join(cls.data_prefix,
'ava_proposals_sample.pkl')
cls.pipeline = [
- dict(dict(type='SampleAVAFrames', clip_len=32, frame_interval=2))
+ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2)
]
cls.proposal = mmengine.load(cls.proposal_file)
@@ -31,8 +31,8 @@ def test_ava_dataset(self):
register_all_modules()
ava_dataset = AVADataset(
self.ann_file,
- self.exclude_file,
self.pipeline,
+ self.exclude_file,
self.label_file,
data_prefix={'img': self.data_prefix},
proposal_file=self.proposal_file)
@@ -40,8 +40,8 @@ def test_ava_dataset(self):
# custom classes
ava_dataset = AVADataset(
self.ann_file,
- self.exclude_file,
self.pipeline,
+ self.exclude_file,
label_file=self.label_file,
custom_classes=[17, 79],
num_classes=3,
@@ -55,16 +55,16 @@ def test_ava_dataset(self):
ava_dataset = AVADataset(
self.ann_file,
- None,
self.pipeline,
+ None,
self.label_file,
data_prefix={'img': self.data_prefix},
proposal_file=self.proposal_file)
ava_dataset = AVADataset(
self.ann_file,
- None,
self.pipeline,
+ None,
self.label_file,
test_mode=True,
data_prefix={'img': self.data_prefix},
@@ -84,8 +84,8 @@ def test_ava_pipeline(self):
ava_dataset = AVADataset(
self.ann_file,
- self.exclude_file,
self.pipeline,
+ self.exclude_file,
self.label_file,
data_prefix={'img': self.data_prefix},
proposal_file=self.proposal_file)
@@ -94,7 +94,7 @@ def test_ava_pipeline(self):
assert result['filename_tmpl'] == 'img_{:05}.jpg'
assert result['modality'] == 'RGB'
- assert result['start_index'] == 0
+ assert result['start_index'] == 1
assert result['timestamp_start'] == 900
assert result['timestamp_end'] == 1800
assert_array_equal(result['proposals'],
@@ -107,8 +107,8 @@ def test_ava_pipeline(self):
ava_dataset = AVADataset(
self.ann_file,
- None,
self.pipeline,
+ None,
self.label_file,
test_mode=True,
data_prefix={'img': self.data_prefix},
@@ -117,11 +117,130 @@ def test_ava_pipeline(self):
result = ava_dataset[0]
assert result['filename_tmpl'] == 'img_{:05}.jpg'
assert result['modality'] == 'RGB'
- assert result['start_index'] == 0
+ assert result['start_index'] == 1
assert result['timestamp_start'] == 900
assert result['timestamp_end'] == 1800
+class TestMultiSportsDataset:
+
+ @classmethod
+ def setup_class(cls):
+ cls.data_prefix = osp.normpath(
+ osp.join(
+ osp.dirname(__file__), './../data', 'multisports_dataset'))
+ cls.ann_file = osp.join(cls.data_prefix, 'multisports_sample.csv')
+ cls.proposal_file = osp.join(cls.data_prefix,
+ 'multisports_proposals_sample.pkl')
+ cls.pipeline = [
+ dict(type='DecordInit'),
+ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
+ dict(type='DecordDecode')
+ ]
+ cls.proposal = mmengine.load(cls.proposal_file)
+
+ def test_multisports_dataset(self):
+ register_all_modules()
+ ava_dataset = AVADataset(
+ self.ann_file,
+ self.pipeline,
+ data_prefix={'img': self.data_prefix},
+ proposal_file=self.proposal_file,
+ use_frames=False,
+ timestamp_start=1,
+ start_index=0,
+ multilabel=False,
+ fps=1)
+
+ ava_dataset = AVADataset(
+ self.ann_file,
+ self.pipeline,
+ test_mode=True,
+ data_prefix={'img': self.data_prefix},
+ proposal_file=self.proposal_file,
+ use_frames=False,
+ timestamp_start=1,
+ start_index=0,
+ multilabel=False,
+ fps=1)
+
+ del ava_dataset
+
+    def test_multisports_pipeline(self):
+ register_all_modules()
+ target_keys = [
+ 'filename', 'video_id', 'timestamp', 'img_key', 'shot_info', 'fps',
+ 'filename_tmpl', 'modality', 'start_index', 'timestamp_start',
+ 'timestamp_end', 'proposals', 'scores', 'frame_inds', 'clip_len',
+ 'frame_interval', 'gt_labels', 'gt_bboxes', 'entity_ids'
+ ]
+
+ def mock_video_reader(filename):
+ from unittest.mock import MagicMock
+ container = MagicMock()
+ container.__len__.return_value = 100
+ container.get_avg_fps.return_value = 24
+ frame_batch = MagicMock()
+ frame_batch.asnumpy.return_value = np.zeros((32, 720, 1280, 3))
+ container.get_batch.return_value = frame_batch
+ return container
+
+ ava_dataset = AVADataset(
+ self.ann_file,
+ self.pipeline,
+ data_prefix={'img': self.data_prefix},
+ proposal_file=self.proposal_file,
+ use_frames=False,
+ timestamp_start=1,
+ start_index=0,
+ multilabel=False,
+ fps=1)
+
+ # Mock a decord Container
+ ava_dataset.pipeline.transforms[
+ 0]._get_video_reader = mock_video_reader
+ result = ava_dataset[0]
+ assert assert_dict_has_keys(result, target_keys)
+
+ assert result['modality'] == 'RGB'
+ assert result['fps'] == 1
+ assert result['start_index'] == 0
+
+ h, w = result['imgs'][0].shape[:2]
+ scale_factor = np.array([w, h, w, h])
+ gt_bboxes = np.array([[0.71097612, 0.44144461, 0.79291363, 0.80873633],
+ [0.19915699, 0.40121613, 0.29834411,
+ 0.79667876]])
+ assert_array_almost_equal(
+ result['proposals'], gt_bboxes * scale_factor, decimal=4)
+ assert_array_almost_equal(result['scores'],
+ np.array([0.994165, 0.9902001]))
+
+ assert result['clip_len'] == 32
+ assert result['frame_interval'] == 2
+ assert len(result['frame_inds']) == 32
+
+ ava_dataset = AVADataset(
+ self.ann_file,
+ self.pipeline,
+ test_mode=True,
+ data_prefix={'img': self.data_prefix},
+ proposal_file=self.proposal_file,
+ use_frames=False,
+ timestamp_start=1,
+ start_index=0,
+ multilabel=False,
+ fps=1)
+ # Mock a decord Container
+ ava_dataset.pipeline.transforms[
+ 0]._get_video_reader = mock_video_reader
+ # Try to get a sample
+ result = ava_dataset[0]
+ assert result['modality'] == 'RGB'
+ assert result['fps'] == 1
+ assert result['start_index'] == 0
+
+
class TestAVAKineticsDataset:
@classmethod
diff --git a/tests/datasets/test_video_text_dataset.py b/tests/datasets/test_video_text_dataset.py
new file mode 100644
index 0000000000..43cd26ae0a
--- /dev/null
+++ b/tests/datasets/test_video_text_dataset.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.testing import assert_dict_has_keys
+
+from mmaction.datasets import VideoTextDataset
+from mmaction.utils import register_all_modules
+from .base import BaseTestDataset
+
+
+class TestVideoTextDataset(BaseTestDataset):
+ register_all_modules()
+
+ def test_video_dataset(self):
+ video_dataset = VideoTextDataset(
+ self.video_text_ann_file,
+ self.video_text_pipeline,
+ data_prefix={'video': self.data_prefix},
+ start_index=3)
+ assert len(video_dataset) == 2
+ assert video_dataset.start_index == 3
+
+ video_dataset = VideoTextDataset(
+ self.video_text_ann_file,
+ self.video_text_pipeline,
+ data_prefix={'video': self.data_prefix})
+ assert video_dataset.start_index == 0
+
+ def test_video_pipeline(self):
+ target_keys = ['filename', 'text', 'start_index', 'modality', 'imgs']
+
+ # VideoTextDataset not in test mode
+ video_dataset = VideoTextDataset(
+ self.video_text_ann_file,
+ self.video_text_pipeline,
+ data_prefix={'video': self.data_prefix},
+ test_mode=False)
+ result = video_dataset[0]
+ assert assert_dict_has_keys(result, target_keys)
+
+ # VideoTextDataset in test mode
+ video_dataset = VideoTextDataset(
+ self.video_text_ann_file,
+ self.video_text_pipeline,
+ data_prefix={'video': self.data_prefix},
+ test_mode=True)
+ result = video_dataset[0]
+ assert assert_dict_has_keys(result, target_keys)
diff --git a/tests/datasets/transforms/test_formating.py b/tests/datasets/transforms/test_formating.py
index 8e741c24e5..4668732746 100644
--- a/tests/datasets/transforms/test_formating.py
+++ b/tests/datasets/transforms/test_formating.py
@@ -5,7 +5,7 @@
import numpy as np
import pytest
import torch
-from mmengine.structures import InstanceData, LabelData
+from mmengine.structures import InstanceData
from mmengine.testing import assert_dict_has_keys
from numpy.testing import assert_array_equal
@@ -22,62 +22,77 @@
class TestPackActionInputs(unittest.TestCase):
def test_transform(self):
+ # none input
+ with self.assertRaises(ValueError):
+ results = PackActionInputs()(dict())
+
# keypoint input
- results = dict(keypoint=np.random.randn(1, 2, 300, 17, 3), label=1)
+ results = dict(keypoint=np.random.randn(2, 300, 17, 3), label=1)
transform = PackActionInputs()
results = transform(results)
self.assertIn('inputs', results)
self.assertIn('data_samples', results)
self.assertIsInstance(results['inputs'], torch.Tensor)
+ self.assertEqual(results['inputs'].shape, (2, 300, 17, 3))
self.assertEqual(results['data_samples'].gt_labels.item,
torch.LongTensor([1]))
- # audio input
- data = dict(
- audios=np.random.randn(3, 1, 128, 80),
- label=[1],
- filename='test.txt')
+ # heatmap_imgs input
+ results = dict(heatmap_imgs=np.random.randn(2, 17, 56, 56), label=1)
+ transform = PackActionInputs()
+ results = transform(results)
+ self.assertIn('inputs', results)
+ self.assertIn('data_samples', results)
+ self.assertIsInstance(results['inputs'], torch.Tensor)
+ self.assertEqual(results['inputs'].shape, (2, 17, 56, 56))
+ self.assertEqual(results['data_samples'].gt_labels.item,
+ torch.LongTensor([1]))
- cfg = dict(type='PackActionInputs')
- transform = TRANSFORMS.build(cfg)
- results = transform(copy.deepcopy(data))
+ # audios input
+ results = dict(audios=np.random.randn(3, 1, 128, 80), label=[1])
+ transform = PackActionInputs()
+ results = transform(results)
self.assertIn('inputs', results)
+ self.assertIn('data_samples', results)
+ self.assertEqual(results['inputs'].shape, (3, 1, 128, 80))
self.assertIsInstance(results['inputs'], torch.Tensor)
- # img input with label
+
+ # text input
+ results = dict(text=np.random.randn(77))
+ transform = PackActionInputs()
+ results = transform(results)
+ self.assertIn('inputs', results)
+ self.assertIn('data_samples', results)
+ self.assertEqual(results['inputs'].shape, (77, ))
+ self.assertIsInstance(results['inputs'], torch.Tensor)
+
+ # imgs input with label
data = dict(
- imgs=np.random.randn(256, 256, 3),
+ imgs=np.random.randn(2, 256, 256, 3),
label=[1],
filename='test.txt',
original_shape=(256, 256, 3),
img_shape=(256, 256, 3),
- pad_shape=(256, 256, 3),
flip_direction='vertical')
- cfg = dict(type='PackActionInputs')
- transform = TRANSFORMS.build(cfg)
+ transform = PackActionInputs()
results = transform(copy.deepcopy(data))
self.assertIn('inputs', results)
- self.assertIsInstance(results['inputs'], torch.Tensor)
self.assertIn('data_samples', results)
+ self.assertIsInstance(results['inputs'], torch.Tensor)
self.assertIsInstance(results['data_samples'], ActionDataSample)
- self.assertIn('img_shape', results['data_samples'].metainfo_keys())
- self.assertIsInstance(results['data_samples'].gt_labels, LabelData)
+ self.assertEqual(results['data_samples'].img_shape, (256, 256, 3))
+ self.assertEqual(results['data_samples'].gt_labels.item,
+ torch.LongTensor([1]))
# Test grayscale image
data['imgs'] = data['imgs'].mean(-1)
results = transform(copy.deepcopy(data))
self.assertIn('inputs', results)
self.assertIsInstance(results['inputs'], torch.Tensor)
- self.assertEqual(results['inputs'].shape, (256, 256))
+ self.assertEqual(results['inputs'].shape, (2, 256, 256))
- # Test without `img` and `gt_label`
- del data['imgs']
- del data['label']
- with self.assertRaises(ValueError):
- results = transform(copy.deepcopy(data))
- self.assertNotIn('gt_labels', results['data_samples'])
-
- # img input with gt_bboxes
+ # imgs input with gt_bboxes
data = dict(
imgs=np.random.randn(256, 256, 3),
gt_bboxes=np.array([[0, 0, 340, 224]]),
@@ -85,8 +100,7 @@ def test_transform(self):
proposals=np.array([[0, 0, 340, 224]]),
filename='test.txt')
- cfg = dict(type='PackActionInputs')
- transform = TRANSFORMS.build(cfg)
+ transform = PackActionInputs()
results = transform(copy.deepcopy(data))
self.assertIn('inputs', results)
self.assertIsInstance(results['inputs'], torch.Tensor)
@@ -96,6 +110,18 @@ def test_transform(self):
InstanceData)
self.assertIsInstance(results['data_samples'].proposals, InstanceData)
+ # imgs and text input
+ data = dict(
+ imgs=np.random.randn(2, 256, 256, 3), text=np.random.randn(77))
+
+ transform = PackActionInputs(collect_keys=('imgs', 'text'))
+ results = transform(copy.deepcopy(data))
+ self.assertIn('inputs', results)
+ self.assertIn('data_samples', results)
+ self.assertIsInstance(results['inputs'], dict)
+ self.assertEqual(results['inputs']['imgs'].shape, (2, 256, 256, 3))
+ self.assertEqual(results['inputs']['text'].shape, (77, ))
+
def test_repr(self):
cfg = dict(
type='PackActionInputs', meta_keys=['flip_direction', 'img_shape'])
@@ -114,7 +140,7 @@ def test_transform(self):
gt_bbox=np.array([[0.1, 0.3], [0.375, 0.625]]),
filename='test.txt')
- cfg = dict(type='PackLocalizationInputs', keys='gt_bbox')
+ cfg = dict(type='PackLocalizationInputs', keys=('gt_bbox', ))
transform = TRANSFORMS.build(cfg)
results = transform(copy.deepcopy(data))
self.assertIn('inputs', results)
diff --git a/tests/datasets/transforms/test_sampling.py b/tests/datasets/transforms/test_sampling.py
index f4a5e457bd..364972ea39 100644
--- a/tests/datasets/transforms/test_sampling.py
+++ b/tests/datasets/transforms/test_sampling.py
@@ -561,7 +561,7 @@ def test_untrim_sample_frames(self):
label=1)
video_result = copy.deepcopy(self.video_results)
- config = dict(clip_len=1, frame_interval=16) # , start_index=0)
+ config = dict(clip_len=1, clip_interval=16) # , start_index=0)
sample_frames = UntrimmedSampleFrames(**config)
sample_frames_results = sample_frames(frame_result)
assert assert_dict_has_keys(sample_frames_results, target_keys)
@@ -570,9 +570,10 @@ def test_untrim_sample_frames(self):
np.array([8, 24, 40, 56, 72, 88]))
assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}('
f'clip_len={1}, '
- f'frame_interval={16})')
+ f'clip_interval={16}, '
+ f'frame_interval={1})')
- config = dict(clip_len=1, frame_interval=16) # , start_index=0)
+ config = dict(clip_len=1, clip_interval=16) # , start_index=0)
sample_frames = UntrimmedSampleFrames(**config)
sample_frames_results = sample_frames(video_result)
assert assert_dict_has_keys(sample_frames_results, target_keys)
@@ -581,9 +582,10 @@ def test_untrim_sample_frames(self):
assert_array_equal(sample_frames_results['frame_inds'], frame_inds)
assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}('
f'clip_len={1}, '
- f'frame_interval={16})')
+ f'clip_interval={16}, '
+ f'frame_interval={1})')
- config = dict(clip_len=1, frame_interval=16)
+ config = dict(clip_len=1, clip_interval=16)
sample_frames = UntrimmedSampleFrames(**config)
frame_result_ = copy.deepcopy(frame_result)
frame_result_['start_index'] = 1
@@ -594,9 +596,10 @@ def test_untrim_sample_frames(self):
np.array([8, 24, 40, 56, 72, 88]) + 1)
assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}('
f'clip_len={1}, '
- f'frame_interval={16})')
+ f'clip_interval={16}, '
+ f'frame_interval={1})')
- config = dict(clip_len=3, frame_interval=16) # , start_index=0)
+ config = dict(clip_len=3, clip_interval=16) # , start_index=0)
sample_frames = UntrimmedSampleFrames(**config)
sample_frames_results = sample_frames(frame_result)
assert assert_dict_has_keys(sample_frames_results, target_keys)
@@ -609,7 +612,25 @@ def test_untrim_sample_frames(self):
]))
assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}('
f'clip_len={3}, '
- f'frame_interval={16})')
+ f'clip_interval={16}, '
+ f'frame_interval={1})')
+
+ config = dict(
+ clip_len=3, clip_interval=16, frame_interval=4) # , start_index=0)
+ sample_frames = UntrimmedSampleFrames(**config)
+ sample_frames_results = sample_frames(frame_result)
+ assert assert_dict_has_keys(sample_frames_results, target_keys)
+ assert len(sample_frames_results['frame_inds']) == 18
+ assert_array_equal(
+ sample_frames_results['frame_inds'],
+ np.array([
+ 4, 8, 12, 20, 24, 28, 36, 40, 44, 52, 56, 60, 68, 72, 76, 84,
+ 88, 92
+ ]))
+ assert repr(sample_frames) == (f'{sample_frames.__class__.__name__}('
+ f'clip_len={3}, '
+ f'clip_interval={16}, '
+ f'frame_interval={4})')
def test_sample_ava_frames(self):
target_keys = [
diff --git a/tests/datasets/transforms/test_text_transforms.py b/tests/datasets/transforms/test_text_transforms.py
new file mode 100644
index 0000000000..f8018bfc50
--- /dev/null
+++ b/tests/datasets/transforms/test_text_transforms.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmaction.datasets.transforms import CLIPTokenize
+
+
+class TestTextTransforms:
+
+ @staticmethod
+ def test_clip_tokenize():
+ results = {'text': 'Hello, MMAction2 2.0!'}
+ clip_tokenize = CLIPTokenize()
+ results = clip_tokenize(results)
+ assert results['text'].shape[0] == 77
+ assert results['text'].dtype == torch.int32
diff --git a/tests/evaluation/metrics/test_acc_metric.py b/tests/evaluation/metrics/test_acc_metric.py
index 295c301ab6..aeb6fb2cb0 100644
--- a/tests/evaluation/metrics/test_acc_metric.py
+++ b/tests/evaluation/metrics/test_acc_metric.py
@@ -1,10 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import platform
from unittest import TestCase
import numpy as np
+import pytest
import torch
+from mmengine import load
+from numpy.testing import assert_array_almost_equal
-from mmaction.evaluation import AccMetric, ConfusionMatrix
+from mmaction.evaluation import AccMetric, ConfusionMatrix, MultiSportsMetric
+from mmaction.evaluation.functional import ava_eval
from mmaction.registry import METRICS
from mmaction.structures import ActionDataSample
@@ -54,6 +60,37 @@ def test_acc_metric():
assert eval_results['mmit_mean_average_precision'] == 1.0
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Multiprocess Fail')
+def test_ava_detection():
+ data_prefix = osp.normpath(
+ osp.join(osp.dirname(__file__), '../../data/eval_detection'))
+
+ gt_path = osp.join(data_prefix, 'gt.csv')
+ result_path = osp.join(data_prefix, 'pred.csv')
+ label_map = osp.join(data_prefix, 'action_list.txt')
+
+ # eval bbox
+ detection = ava_eval(result_path, 'mAP', label_map, gt_path, None)
+ assert_array_almost_equal(detection['overall'], 0.09385522)
+
+
+def test_multisport_detection():
+ data_prefix = osp.normpath(
+ osp.join(osp.dirname(__file__), '../../data/eval_multisports'))
+
+ gt_path = osp.join(data_prefix, 'gt.pkl')
+ result_path = osp.join(data_prefix, 'data_samples.pkl')
+
+ result_datasamples = load(result_path)
+ metric = MultiSportsMetric(gt_path)
+ metric.process(None, result_datasamples)
+ eval_result = metric.compute_metrics(metric.results)
+ assert eval_result['frameAP'] == 83.6506
+ assert eval_result['v_map@0.2'] == 37.5
+ assert eval_result['v_map@0.5'] == 37.5
+ assert eval_result['v_map_0.10:0.90'] == 29.1667
+
+
class TestConfusionMatrix(TestCase):
def test_evaluate(self):
diff --git a/tests/evaluation/metrics/test_retrieval_metric.py b/tests/evaluation/metrics/test_retrieval_metric.py
new file mode 100644
index 0000000000..cb1f1c72ba
--- /dev/null
+++ b/tests/evaluation/metrics/test_retrieval_metric.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmaction.evaluation.metrics import RetrievalMetric
+
+
+def generate_data(num_samples=5, feat_dim=10, random_label=False):
+ data_batch = []
+ data_samples = []
+ for i in range(num_samples):
+ if random_label:
+ video_feature = torch.randn(feat_dim)
+ text_feature = torch.randn(feat_dim)
+ else:
+ video_feature = torch.randn(feat_dim)
+ text_feature = video_feature.clone()
+
+ data_sample = dict(
+ features=dict(
+ video_feature=video_feature, text_feature=text_feature))
+ data_samples.append(data_sample)
+ return data_batch, data_samples
+
+
+def test_acc_metric():
+ with pytest.raises(ValueError):
+ RetrievalMetric(metric_list='R100')
+
+ num_samples = 20
+ metric = RetrievalMetric()
+ data_batch, predictions = generate_data(
+ num_samples=num_samples, random_label=True)
+ metric.process(data_batch, predictions)
+ eval_results = metric.compute_metrics(metric.results)
+ assert 0.0 <= eval_results['R1'] <= eval_results['R5'] <= eval_results[
+ 'R10'] <= 100.0
+ assert 0.0 <= eval_results['MdR'] <= num_samples
+ assert 0.0 <= eval_results['MnR'] <= num_samples
+
+ metric.results.clear()
+
+ data_batch, predictions = generate_data(
+ num_samples=num_samples, random_label=False)
+ metric.process(data_batch, predictions)
+ eval_results = metric.compute_metrics(metric.results)
+ assert eval_results['R1'] == eval_results['R5'] == eval_results[
+ 'R10'] == 100.0
+ assert eval_results['MdR'] == eval_results['MnR'] == 1.0
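For context on the bounds asserted above: R@K, MdR and MnR are conventionally derived from the rank of each text query's ground-truth video in a text-to-video similarity matrix. A minimal sketch under that assumption (the exact `RetrievalMetric` internals may differ):

```python
import numpy as np

def retrieval_metrics(video_feats, text_feats):
    # cosine similarity between every text query and every video
    v = video_feats / np.linalg.norm(video_feats, axis=1, keepdims=True)
    t = text_feats / np.linalg.norm(text_feats, axis=1, keepdims=True)
    sim = t @ v.T                                   # (num_texts, num_videos)
    order = np.argsort(-sim, axis=1)                # best match first
    gt = np.arange(sim.shape[0])                    # i-th text pairs with i-th video
    ranks = np.argmax(order == gt[:, None], axis=1) + 1
    return {'R1': 100.0 * np.mean(ranks <= 1),
            'R5': 100.0 * np.mean(ranks <= 5),
            'R10': 100.0 * np.mean(ranks <= 10),
            'MdR': float(np.median(ranks)),
            'MnR': float(np.mean(ranks))}
```

When each text feature is a clone of its video feature (`random_label=False`), every query ranks its own video first, which is why the second half of the test expects `R1 == R5 == R10 == 100` and `MdR == MnR == 1`.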
diff --git a/tests/models/backbones/test_mobilenet_v2.py b/tests/models/backbones/test_mobilenet_v2.py
index 4ddb8e8091..d9d862e8f8 100644
--- a/tests/models/backbones/test_mobilenet_v2.py
+++ b/tests/models/backbones/test_mobilenet_v2.py
@@ -30,8 +30,8 @@ def is_block(modules):
with pytest.raises(TypeError):
# pretrained must be a string path
- model = MobileNetV2(pretrained=0)
- model.init_weights()
+ model = MobileNetV2()
+ model.init_weights(pretrained=0)
with pytest.raises(ValueError):
# frozen_stages must in range(1, 9)
diff --git a/tests/models/backbones/test_mobilenet_v2_tsm.py b/tests/models/backbones/test_mobilenet_v2_tsm.py
index 34ad6b86a9..b0637be1d5 100644
--- a/tests/models/backbones/test_mobilenet_v2_tsm.py
+++ b/tests/models/backbones/test_mobilenet_v2_tsm.py
@@ -16,7 +16,7 @@ def test_mobilenetv2_tsm_backbone():
imgs = generate_backbone_demo_inputs(input_shape)
# mobilenetv2_tsm with width_mult = 1.0
- mobilenetv2_tsm = MobileNetV2TSM()
+ mobilenetv2_tsm = MobileNetV2TSM(pretrained='mmcls://mobilenet_v2')
mobilenetv2_tsm.init_weights()
for cur_module in mobilenetv2_tsm.modules():
if isinstance(cur_module, InvertedResidual) and \
@@ -33,13 +33,13 @@ def test_mobilenetv2_tsm_backbone():
assert feat.shape == torch.Size([8, 1280, 2, 2])
# mobilenetv2 with widen_factor = 0.5 forword
- mobilenetv2_tsm_05 = MobileNetV2TSM(widen_factor=0.5)
+ mobilenetv2_tsm_05 = MobileNetV2TSM(widen_factor=0.5, pretrained2d=False)
mobilenetv2_tsm_05.init_weights()
feat = mobilenetv2_tsm_05(imgs)
assert feat.shape == torch.Size([8, 1280, 2, 2])
# mobilenetv2 with widen_factor = 1.5 forword
- mobilenetv2_tsm_15 = MobileNetV2TSM(widen_factor=1.5)
+ mobilenetv2_tsm_15 = MobileNetV2TSM(widen_factor=1.5, pretrained2d=False)
mobilenetv2_tsm_15.init_weights()
feat = mobilenetv2_tsm_15(imgs)
assert feat.shape == torch.Size([8, 1920, 2, 2])
diff --git a/tests/models/backbones/test_resnet_tsm.py b/tests/models/backbones/test_resnet_tsm.py
index ddac948354..db22aaf3cb 100644
--- a/tests/models/backbones/test_resnet_tsm.py
+++ b/tests/models/backbones/test_resnet_tsm.py
@@ -1,113 +1,130 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy
+from unittest import TestCase
import pytest
import torch
import torch.nn as nn
from mmaction.models import ResNetTSM
-from mmaction.models.backbones.resnet_tsm import NL3DWrapper
+from mmaction.models.backbones.resnet import Bottleneck
+from mmaction.models.backbones.resnet_tsm import NL3DWrapper, TemporalShift
from mmaction.testing import generate_backbone_demo_inputs
-def test_resnet_tsm_backbone():
- """Test resnet_tsm backbone."""
- with pytest.raises(NotImplementedError):
- # shift_place must be block or blockres
- resnet_tsm_50_block = ResNetTSM(50, shift_place='Block')
+class Test_ResNet_TSM(TestCase):
+
+ def setUp(self):
+ input_shape = (8, 3, 64, 64)
+ self.imgs = generate_backbone_demo_inputs(input_shape)
+
+ def test_init(self):
+ with pytest.raises(NotImplementedError):
+ # shift_place must be block or blockres
+ resnet_tsm_50_block = ResNetTSM(50, shift_place='Block')
+ resnet_tsm_50_block.init_weights()
+
+ def test_init_from_scratch(self):
+ resnet_tsm_50 = ResNetTSM(50, pretrained=None, pretrained2d=False)
+ resnet_tsm_50.init_weights()
+
+ def test_resnet_tsm_temporal_shift_blockres(self):
+ # resnet_tsm with depth 50
+ resnet_tsm_50 = ResNetTSM(50, pretrained='torchvision://resnet50')
+ resnet_tsm_50.init_weights()
+ for layer_name in resnet_tsm_50.res_layers:
+ layer = getattr(resnet_tsm_50, layer_name)
+ blocks = list(layer.children())
+ for block in blocks:
+ assert isinstance(block.conv1.conv, TemporalShift)
+ assert block.conv1.conv.num_segments == resnet_tsm_50.num_segments # noqa: E501
+ assert block.conv1.conv.shift_div == resnet_tsm_50.shift_div
+ assert isinstance(block.conv1.conv.net, nn.Conv2d)
+ feat = resnet_tsm_50(self.imgs)
+ assert feat.shape == torch.Size([8, 2048, 2, 2])
+
+ def test_resnet_tsm_temporal_shift_block(self):
+ # resnet_tsm with depth 50, no pretrained, shift_place is block
+ resnet_tsm_50_block = ResNetTSM(
+ 50, shift_place='block', pretrained='torchvision://resnet50')
resnet_tsm_50_block.init_weights()
-
- from mmaction.models.backbones.resnet import Bottleneck
- from mmaction.models.backbones.resnet_tsm import TemporalShift
-
- input_shape = (8, 3, 64, 64)
- imgs = generate_backbone_demo_inputs(input_shape)
-
- # resnet_tsm with depth 50
- resnet_tsm_50 = ResNetTSM(50)
- resnet_tsm_50.init_weights()
- for layer_name in resnet_tsm_50.res_layers:
- layer = getattr(resnet_tsm_50, layer_name)
- blocks = list(layer.children())
- for block in blocks:
- assert isinstance(block.conv1.conv, TemporalShift)
- assert block.conv1.conv.num_segments == resnet_tsm_50.num_segments
- assert block.conv1.conv.shift_div == resnet_tsm_50.shift_div
- assert isinstance(block.conv1.conv.net, nn.Conv2d)
-
- # resnet_tsm with depth 50, no pretrained, shift_place is block
- resnet_tsm_50_block = ResNetTSM(50, shift_place='block')
- resnet_tsm_50_block.init_weights()
- for layer_name in resnet_tsm_50_block.res_layers:
- layer = getattr(resnet_tsm_50_block, layer_name)
- blocks = list(layer.children())
- for block in blocks:
- assert isinstance(block, TemporalShift)
- assert block.num_segments == resnet_tsm_50_block.num_segments
- assert block.num_segments == resnet_tsm_50_block.num_segments
- assert block.shift_div == resnet_tsm_50_block.shift_div
- assert isinstance(block.net, Bottleneck)
-
- # resnet_tsm with depth 50, no pretrained, use temporal_pool
- resnet_tsm_50_temporal_pool = ResNetTSM(50, temporal_pool=True)
- resnet_tsm_50_temporal_pool.init_weights()
- for layer_name in resnet_tsm_50_temporal_pool.res_layers:
- layer = getattr(resnet_tsm_50_temporal_pool, layer_name)
- blocks = list(layer.children())
-
- if layer_name == 'layer2':
- assert len(blocks) == 2
- assert isinstance(blocks[1], nn.MaxPool3d)
- blocks = copy.deepcopy(blocks[0])
-
- for block in blocks:
- assert isinstance(block.conv1.conv, TemporalShift)
- if layer_name == 'layer1':
- assert block.conv1.conv.num_segments == \
- resnet_tsm_50_temporal_pool.num_segments
- else:
- assert block.conv1.conv.num_segments == \
- resnet_tsm_50_temporal_pool.num_segments // 2
- assert block.conv1.conv.shift_div == resnet_tsm_50_temporal_pool.shift_div # noqa: E501
- assert isinstance(block.conv1.conv.net, nn.Conv2d)
-
- # resnet_tsm with non-local module
- non_local_cfg = dict(
- sub_sample=True,
- use_scale=False,
- norm_cfg=dict(type='BN3d', requires_grad=True),
- mode='embedded_gaussian')
- non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0))
- resnet_tsm_nonlocal = ResNetTSM(
- 50, non_local=non_local, non_local_cfg=non_local_cfg)
- resnet_tsm_nonlocal.init_weights()
- for layer_name in ['layer2', 'layer3']:
- layer = getattr(resnet_tsm_nonlocal, layer_name)
- for i, _ in enumerate(layer):
- if i % 2 == 0:
- assert isinstance(layer[i], NL3DWrapper)
-
- resnet_tsm_50_full = ResNetTSM(
- 50,
- non_local=non_local,
- non_local_cfg=non_local_cfg,
- temporal_pool=True)
- resnet_tsm_50_full.init_weights()
-
- # TSM forword
- feat = resnet_tsm_50(imgs)
- assert feat.shape == torch.Size([8, 2048, 2, 2])
-
- # TSM with non-local forward
- feat = resnet_tsm_nonlocal(imgs)
- assert feat.shape == torch.Size([8, 2048, 2, 2])
-
- # TSM with temporal pool forward
- feat = resnet_tsm_50_temporal_pool(imgs)
- assert feat.shape == torch.Size([4, 2048, 2, 2])
-
- # TSM with temporal pool + non-local forward
- input_shape = (16, 3, 32, 32)
- imgs = generate_backbone_demo_inputs(input_shape)
- feat = resnet_tsm_50_full(imgs)
- assert feat.shape == torch.Size([8, 2048, 1, 1])
+ for layer_name in resnet_tsm_50_block.res_layers:
+ layer = getattr(resnet_tsm_50_block, layer_name)
+ blocks = list(layer.children())
+ for block in blocks:
+ assert isinstance(block, TemporalShift)
+ assert block.num_segments == resnet_tsm_50_block.num_segments
+ assert block.num_segments == resnet_tsm_50_block.num_segments
+ assert block.shift_div == resnet_tsm_50_block.shift_div
+ assert isinstance(block.net, Bottleneck)
+
+ def test_resnet_tsm_temporal_pool(self):
+ # resnet_tsm with depth 50, no pretrained, use temporal_pool
+ resnet_tsm_50_temporal_pool = ResNetTSM(
+ 50, temporal_pool=True, pretrained='torchvision://resnet50')
+ resnet_tsm_50_temporal_pool.init_weights()
+ for layer_name in resnet_tsm_50_temporal_pool.res_layers:
+ layer = getattr(resnet_tsm_50_temporal_pool, layer_name)
+ blocks = list(layer.children())
+
+ if layer_name == 'layer2':
+ assert len(blocks) == 2
+ assert isinstance(blocks[1], nn.MaxPool3d)
+ blocks = copy.deepcopy(blocks[0])
+
+ for block in blocks:
+ assert isinstance(block.conv1.conv, TemporalShift)
+ if layer_name == 'layer1':
+ assert block.conv1.conv.num_segments == \
+ resnet_tsm_50_temporal_pool.num_segments
+ else:
+ assert block.conv1.conv.num_segments == \
+ resnet_tsm_50_temporal_pool.num_segments // 2
+ assert block.conv1.conv.shift_div == resnet_tsm_50_temporal_pool.shift_div # noqa: E501
+ assert isinstance(block.conv1.conv.net, nn.Conv2d)
+
+ feat = resnet_tsm_50_temporal_pool(self.imgs)
+ assert feat.shape == torch.Size([4, 2048, 2, 2])
+
+ def test_resnet_tsm_non_local(self):
+ # resnet_tsm with non-local module
+ non_local_cfg = dict(
+ sub_sample=True,
+ use_scale=False,
+ norm_cfg=dict(type='BN3d', requires_grad=True),
+ mode='embedded_gaussian')
+ non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0))
+ resnet_tsm_nonlocal = ResNetTSM(
+ 50,
+ non_local=non_local,
+ non_local_cfg=non_local_cfg,
+ pretrained='torchvision://resnet50')
+ resnet_tsm_nonlocal.init_weights()
+ for layer_name in ['layer2', 'layer3']:
+ layer = getattr(resnet_tsm_nonlocal, layer_name)
+ for i, _ in enumerate(layer):
+ if i % 2 == 0:
+ assert isinstance(layer[i], NL3DWrapper)
+
+ feat = resnet_tsm_nonlocal(self.imgs)
+ assert feat.shape == torch.Size([8, 2048, 2, 2])
+
+ def test_resnet_tsm_full(self):
+ non_local_cfg = dict(
+ sub_sample=True,
+ use_scale=False,
+ norm_cfg=dict(type='BN3d', requires_grad=True),
+ mode='embedded_gaussian')
+ non_local = ((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0))
+ resnet_tsm_50_full = ResNetTSM(
+ 50,
+ pretrained='torchvision://resnet50',
+ non_local=non_local,
+ non_local_cfg=non_local_cfg,
+ temporal_pool=True)
+ resnet_tsm_50_full.init_weights()
+
+ input_shape = (16, 3, 32, 32)
+ imgs = generate_backbone_demo_inputs(input_shape)
+ feat = resnet_tsm_50_full(imgs)
+ assert feat.shape == torch.Size([8, 2048, 1, 1])
diff --git a/tests/models/data_preprocessors/test_data_preprocessor.py b/tests/models/data_preprocessors/test_data_preprocessor.py
index a4a3d851d7..5fe3e8f663 100644
--- a/tests/models/data_preprocessors/test_data_preprocessor.py
+++ b/tests/models/data_preprocessors/test_data_preprocessor.py
@@ -95,3 +95,8 @@ def test_data_preprocessor():
(raw_3d_data['inputs'][0] - psr.mean) / psr.std)
assert_array_equal(data[1]['inputs'][1],
(raw_3d_data['inputs'][1] - psr.mean) / psr.std)
+
+ raw_data = generate_dummy_data(2, (77, ))
+ psr = ActionDataPreprocessor(to_float32=False)
+ data = psr(raw_data)
+ assert data['inputs'].dtype == raw_data['inputs'][0].dtype
diff --git a/tests/models/heads/test_feature_head.py b/tests/models/heads/test_feature_head.py
new file mode 100644
index 0000000000..932ed87133
--- /dev/null
+++ b/tests/models/heads/test_feature_head.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+
+from mmaction.models import FeatureHead
+from mmaction.registry import MODELS
+from mmaction.structures import ActionDataSample
+from mmaction.testing import get_recognizer_cfg
+from mmaction.utils import register_all_modules
+
+
+class TestFeatureHead(TestCase):
+
+ def test_2d_recognizer(self):
+ register_all_modules()
+ config = get_recognizer_cfg(
+ 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' # noqa: E501
+ )
+ config.model['backbone']['pretrained'] = None
+ config.model['cls_head'] = dict(
+ type='FeatureHead', average_clips='score')
+
+ recognizer = MODELS.build(config.model)
+
+ input_shape = [3, 3, 32, 32]
+ data_batch = {
+ 'inputs': [torch.randint(0, 256, input_shape)],
+ 'data_samples': [ActionDataSample().set_gt_labels(2)]
+ }
+ feat = recognizer.test_step(data_batch)
+ assert isinstance(feat, torch.Tensor)
+ assert feat.shape == torch.Size([1, 2048])
+
+ def test_3d_recognizer(self):
+ register_all_modules()
+ config = get_recognizer_cfg(
+ 'slowonly/slowonly_r50_8xb16-4x16x1-256e_kinetics400-rgb.py')
+ config.model['backbone']['pretrained'] = None
+ config.model['backbone']['pretrained2d'] = False
+ config.model['cls_head'] = dict(
+ type='FeatureHead', average_clips='score')
+
+ recognizer = MODELS.build(config.model)
+ input_shape = [1, 3, 4, 32, 32]
+ data_batch = {
+ 'inputs': [torch.randint(0, 256, input_shape)],
+ 'data_samples': [ActionDataSample().set_gt_labels(2)]
+ }
+ feat = recognizer.test_step(data_batch)
+ assert isinstance(feat, torch.Tensor)
+ assert feat.shape == torch.Size([1, 2048])
+
+ def test_3d_backbone(self):
+ with pytest.raises(NotImplementedError):
+ head = FeatureHead(spatial_type='test')
+
+ head = FeatureHead(average_clips='score')
+ x = torch.rand(1, 64, 2, 7, 7)
+ feat = head(x)
+ assert feat.shape == torch.Size([1, 64])
+
+ head = FeatureHead(spatial_type=None, average_clips='score')
+ feat = head(x)
+ assert feat.shape == torch.Size([1, 64, 7, 7])
+
+ head = FeatureHead(temporal_type=None, average_clips='score')
+ feat = head(x)
+ assert feat.shape == torch.Size([1, 64, 2])
+
+ head = FeatureHead(
+ spatial_type=None, temporal_type=None, average_clips='score')
+ feat = head(x)
+ assert feat.shape == torch.Size([1, 64, 2, 7, 7])
+
+ def test_slowfast_backbone(self):
+ head = FeatureHead(backbone_name='slowfast', average_clips='score')
+ x_slow = torch.rand(1, 64, 2, 7, 7)
+ x_fast = torch.rand(1, 32, 6, 7, 7)
+ x = (x_slow, x_fast)
+ feat = head(x)
+ assert feat.shape == torch.Size([1, 96])
+
+ head = FeatureHead(
+ backbone_name='slowfast', spatial_type=None, average_clips='score')
+ feat = head(x)
+ assert feat.shape == torch.Size([1, 96, 7, 7])
+
+ with pytest.raises(AssertionError):
+ head = FeatureHead(
+ backbone_name='slowfast',
+ temporal_type=None,
+ average_clips='score')
+ feat = head(x)
+
+ def test_2d_backbone(self):
+ head = FeatureHead(average_clips='score')
+ x = torch.rand(2, 64, 7, 7)
+ with pytest.raises(AssertionError):
+ feat = head(x)
+
+ feat = head(x, num_segs=2)
+ assert feat.shape == torch.Size([1, 64])
+
+ x = torch.rand(2, 64, 7, 7)
+ head = FeatureHead(spatial_type=None, average_clips='score')
+ feat = head(x, num_segs=2)
+ assert feat.shape == torch.Size([1, 64, 7, 7])
+
+ head = FeatureHead(temporal_type=None, average_clips='score')
+ feat = head(x, num_segs=2)
+ assert feat.shape == torch.Size([1, 2, 64])
+
+ def test_tsm_backbone(self):
+ head = FeatureHead(backbone_name='tsm', average_clips='score')
+ x = torch.rand(2, 64, 7, 7)
+ with pytest.raises(AssertionError):
+ feat = head(x)
+ with pytest.raises(AssertionError):
+ feat = head(x, num_segs=2)
+
+ head = FeatureHead(num_segments=2, average_clips='score')
+ feat = head(x, num_segs=2)
+ assert feat.shape == torch.Size([1, 64])
+
+ x = torch.rand(2, 64, 7, 7)
+ head = FeatureHead(
+ num_segments=2, spatial_type=None, average_clips='score')
+ feat = head(x, num_segs=2)
+ assert feat.shape == torch.Size([1, 64, 7, 7])
+
+ def test_gcn_backbone(self):
+ # N, M, C, T, V
+ head = FeatureHead(backbone_name='gcn', average_clips='score')
+ x = torch.rand(1, 5, 64, 2, 7)
+ feat = head(x)
+ assert feat.shape == torch.Size([1, 64])
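The shape assertions in `test_3d_backbone` amount to average pooling over the temporal and/or spatial dimensions of an `N, C, T, H, W` feature map. A hedged sketch of that behaviour (the real `FeatureHead` may use pooling modules rather than a bare `mean`):

```python
import torch

x = torch.rand(1, 64, 2, 7, 7)                     # N, C, T, H, W
assert x.mean(dim=(2, 3, 4)).shape == (1, 64)       # default: pool T, H and W
assert x.mean(dim=2).shape == (1, 64, 7, 7)         # spatial_type=None: keep H, W
assert x.mean(dim=(3, 4)).shape == (1, 64, 2)       # temporal_type=None: keep T
```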
diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py
index 773bc0806f..b40398755b 100644
--- a/tests/models/recognizers/test_recognizer2d.py
+++ b/tests/models/recognizers/test_recognizer2d.py
@@ -1,44 +1,74 @@
# Copyright (c) OpenMMLab. All rights reserved.
import platform
+from unittest.mock import MagicMock
import pytest
import torch
+from mmengine.utils import digit_version
from mmaction.registry import MODELS
-from mmaction.testing import (generate_recognizer_demo_inputs,
- get_recognizer_cfg)
+from mmaction.structures import ActionDataSample
+from mmaction.testing import get_recognizer_cfg
from mmaction.utils import register_all_modules
+def train_test_step(cfg, input_shape):
+ recognizer = MODELS.build(cfg.model)
+ num_classes = cfg.model.cls_head.num_classes
+ batch_size = input_shape[0]
+ input_shape = input_shape[1:]
+ data_batch = {
+ 'inputs':
+ [torch.randint(0, 256, input_shape) for i in range(batch_size)],
+ 'data_samples':
+ [ActionDataSample().set_gt_labels(2) for i in range(batch_size)]
+ }
+
+ # test train_step
+ optim_wrapper = MagicMock()
+ loss_vars = recognizer.train_step(data_batch, optim_wrapper)
+ assert 'loss' in loss_vars
+ assert 'loss_cls' in loss_vars
+ optim_wrapper.update_params.assert_called_once()
+
+ # test test_step
+ with torch.no_grad():
+ predictions = recognizer.test_step(data_batch)
+ score = predictions[0].pred_scores.item
+ assert len(predictions) == batch_size
+ assert score.shape == torch.Size([num_classes])
+ assert torch.min(score) >= 0
+ assert torch.max(score) <= 1
+
+ # test twice sample + 3 crops
+ num_views = input_shape[0] * 2 * 3
+ input_shape = (num_views, *input_shape[1:])
+ data_batch['inputs'] = [torch.randint(0, 256, input_shape)]
+ with torch.no_grad():
+ predictions = recognizer.test_step(data_batch)
+ score = predictions[0].pred_scores.item
+ assert len(predictions) == batch_size
+ assert score.shape == torch.Size([num_classes])
+
+ return loss_vars, predictions
+
+
def test_tsn():
register_all_modules()
config = get_recognizer_cfg(
'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
config.model['backbone']['pretrained'] = None
- recognizer = MODELS.build(config.model)
-
input_shape = (1, 3, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
-
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
+ train_test_step(config, input_shape)
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # Test forward gradcam
- recognizer(imgs, gradcam=True)
- for one_img in img_list:
- recognizer(one_img, gradcam=True)
- """
- TODO
+def test_tsn_mmcls_backbone():
+ register_all_modules()
+ config = get_recognizer_cfg(
+ 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
+ config.model['backbone']['pretrained'] = None
+ # test mmcls backbone
mmcls_backbone = dict(
type='mmcls.ResNeXt',
depth=101,
@@ -49,109 +79,85 @@ def test_tsn():
style='pytorch')
config.model['backbone'] = mmcls_backbone
- recognizer = MODELS.build(config.model)
+ input_shape = (1, 3, 3, 32, 32)
+ train_test_step(config, input_shape)
+
+ from mmcls.models import ResNeXt
+ mmcls_backbone['type'] = ResNeXt
+ config.model['backbone'] = mmcls_backbone
input_shape = (1, 3, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
+ train_test_step(config, input_shape)
+
+
+def test_tsn_timm_backbone():
+ # test tsn from timm
+ register_all_modules()
+ config = get_recognizer_cfg(
+ 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
+ timm_backbone = dict(type='timm.efficientnet_b0', pretrained=False)
+ config.model['backbone'] = timm_backbone
+ config.model['cls_head']['in_channels'] = 1280
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
+ input_shape = (1, 3, 3, 32, 32)
+ train_test_step(config, input_shape)
+ import timm
+ if digit_version(timm.__version__) <= digit_version('0.6.7'):
+ feature_shape = 'NLC'
+ else:
+ feature_shape = 'NHWC'
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
+ timm_swin = dict(
+ type='timm.swin_base_patch4_window7_224',
+ pretrained=False,
+ feature_shape=feature_shape)
+ config.model['backbone'] = timm_swin
+ config.model['cls_head']['in_channels'] = 1024
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
+ input_shape = (1, 3, 3, 224, 224)
+ train_test_step(config, input_shape)
- # test mixup forward
- # TODO
+def test_tsn_tv_backbone():
+ register_all_modules()
config = get_recognizer_cfg(
- 'tsn/tsn_r50_video_mixup_1x1x8_100e_kinetics400_rgb.py')
+ 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
config.model['backbone']['pretrained'] = None
- recognizer = MODELS.build(config.model)
- input_shape = (2, 8, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
-
- # test torchvision backbones
- # TODO
+ # test tv backbone
tv_backbone = dict(type='torchvision.densenet161', pretrained=True)
config.model['backbone'] = tv_backbone
config.model['cls_head']['in_channels'] = 2208
- recognizer = MODELS.build(config.model)
-
input_shape = (1, 3, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
-
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
+ train_test_step(config, input_shape)
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
+ from torchvision.models import densenet161
+ tv_backbone = dict(type=densenet161, pretrained=True)
+ config.model['backbone'] = tv_backbone
+ config.model['cls_head']['in_channels'] = 2208
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
- """
+ input_shape = (1, 3, 3, 32, 32)
+ train_test_step(config, input_shape)
def test_tsm():
register_all_modules()
config = get_recognizer_cfg(
- 'tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py' # noqa: E501
+ 'tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-100e_kinetics400-rgb.py' # noqa: E501
)
config.model['backbone']['pretrained'] = None
+ config.model['backbone']['pretrained2d'] = None
- recognizer = MODELS.build(config.model)
- recognizer.init_weights()
+ input_shape = (1, 8, 3, 32, 32)
+ train_test_step(config, input_shape)
config = get_recognizer_cfg(
'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py')
config.model['backbone']['pretrained'] = None
-
- recognizer = MODELS.build(config.model)
- recognizer.init_weights()
+ config.model['backbone']['pretrained2d'] = None
input_shape = (1, 8, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
-
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
-
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # test twice sample + 3 crops
- input_shape = (2, 48, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
- imgs = demo_inputs['imgs']
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # Test forward gradcam
- recognizer(imgs, gradcam=True)
- for one_img in img_list:
- recognizer(one_img, gradcam=True)
+ train_test_step(config, input_shape)
def test_trn():
@@ -160,38 +166,8 @@ def test_trn():
'trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py')
config.model['backbone']['pretrained'] = None
- recognizer = MODELS.build(config.model)
-
input_shape = (1, 8, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
-
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
-
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # test twice sample + 3 crops
- input_shape = (2, 48, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
- imgs = demo_inputs['imgs']
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # Test forward gradcam
- recognizer(imgs, gradcam=True)
- for one_img in img_list:
- recognizer(one_img, gradcam=True)
+ train_test_step(config, input_shape)
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
@@ -201,30 +177,8 @@ def test_tpn():
'tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py')
config.model['backbone']['pretrained'] = None
- recognizer = MODELS.build(config.model)
-
- input_shape = (1, 8, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
-
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
-
- losses = recognizer(imgs, gt_labels)
-
- if not isinstance(losses, torch.Tensor):
- for i in losses:
- assert isinstance(i, torch.Tensor)
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # Test forward gradcam
- recognizer(imgs, gradcam=True)
- for one_img in img_list:
- recognizer(one_img, gradcam=True)
+ input_shape = (1, 8, 3, 64, 64)
+ train_test_step(config, input_shape)
def test_tanet():
@@ -233,63 +187,5 @@ def test_tanet():
'dense-1x1x8-100e_kinetics400-rgb.py')
config.model['backbone']['pretrained'] = None
- recognizer = MODELS.build(config.model)
-
input_shape = (1, 8, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
-
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
-
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # test twice sample + 3 crops
- input_shape = (2, 48, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
- imgs = demo_inputs['imgs']
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
-
- # Test forward gradcam
- recognizer(imgs, gradcam=True)
- for one_img in img_list:
- recognizer(one_img, gradcam=True)
-
-
-def test_timm_backbone():
- # test tsn from timm
- register_all_modules()
- config = get_recognizer_cfg(
- 'tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
- config.model['backbone']['pretrained'] = None
- timm_backbone = dict(type='timm.efficientnet_b0', pretrained=False)
- config.model['backbone'] = timm_backbone
- config.model['cls_head']['in_channels'] = 1280
-
- recognizer = MODELS.build(config.model)
-
- input_shape = (1, 3, 3, 32, 32)
- demo_inputs = generate_recognizer_demo_inputs(input_shape)
-
- imgs = demo_inputs['imgs']
- gt_labels = demo_inputs['gt_labels']
-
- losses = recognizer(imgs, gt_labels)
- assert isinstance(losses, torch.Tensor)
-
- # Test forward test
- with torch.no_grad():
- img_list = [img[None, :] for img in imgs]
- for one_img in img_list:
- recognizer(one_img, None, return_loss=False)
+ train_test_step(config, input_shape)
diff --git a/tests/models/similarity/test_adapters.py b/tests/models/similarity/test_adapters.py
new file mode 100644
index 0000000000..56e44f2310
--- /dev/null
+++ b/tests/models/similarity/test_adapters.py
@@ -0,0 +1,44 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmaction.models import SimpleMeanAdapter, TransformerAdapter
+
+
+def test_transformer_adapter():
+ """Test transformer adapter."""
+ with pytest.raises(RuntimeError):
+ num_segs_model = 8
+ num_segs_features = 9
+ adapter = TransformerAdapter(
+ num_segs=num_segs_model,
+ transformer_width=64,
+ transformer_heads=8,
+ transformer_layers=2)
+ features = torch.randn(2, num_segs_features, 64)
+ adapter(features)
+
+ num_segs = 8
+ adapter = TransformerAdapter(
+ num_segs=num_segs,
+ transformer_width=64,
+ transformer_heads=8,
+ transformer_layers=2)
+ adapter.init_weights()
+ features = torch.randn(2, num_segs, 64)
+ adapted_features = adapter(features)
+ assert adapted_features.shape == torch.Size([2, 64])
+
+
+def test_simple_mean_adapter():
+ """Test simple mean adapter."""
+
+ adapter = SimpleMeanAdapter(dim=1)
+ features = torch.randn(2, 8, 64)
+ adapted_features = adapter(features)
+ assert adapted_features.shape == torch.Size([2, 64])
+
+ adapter = SimpleMeanAdapter(dim=(1, 2))
+ features = torch.randn(2, 8, 2, 64)
+ adapted_features = adapter(features)
+ assert adapted_features.shape == torch.Size([2, 64])
diff --git a/tests/models/similarity/test_clip_similarity.py b/tests/models/similarity/test_clip_similarity.py
new file mode 100644
index 0000000000..9afa158243
--- /dev/null
+++ b/tests/models/similarity/test_clip_similarity.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest.mock import MagicMock
+
+import torch
+
+from mmaction.registry import MODELS
+from mmaction.structures import ActionDataSample
+from mmaction.testing import get_similarity_cfg
+from mmaction.utils import register_all_modules
+
+
+def test_clip_similarity():
+ register_all_modules()
+ cfg = get_similarity_cfg(
+ 'clip4clip/'
+ 'clip4clip_vit-base-p32-res224-clip-pre_8xb16-u12-5e_msrvtt-9k-rgb.py')
+ cfg.model.frozen_layers = -1 # no frozen layers
+ model = MODELS.build(cfg.model)
+ model.train()
+
+ data_batch = {
+ 'inputs': {
+ 'imgs': [torch.randint(0, 256, (2, 3, 224, 224))],
+ 'text': [torch.randint(0, 49408, (77, ))]
+ },
+ 'data_samples': [ActionDataSample()]
+ }
+
+ # test train_step
+ optim_wrapper = MagicMock()
+ loss_vars = model.train_step(data_batch, optim_wrapper)
+ assert 'loss' in loss_vars
+ assert 'sim_loss_v2t' in loss_vars
+ assert 'sim_loss_t2v' in loss_vars
+ optim_wrapper.update_params.assert_called_once()
+
+ # test test_step
+ with torch.no_grad():
+ predictions = model.test_step(data_batch)
+ features = predictions[0].features
+ assert len(predictions) == 1
+ assert features.video_feature.size() == (512, )
+ assert features.text_feature.size() == (512, )
+
+ # test frozen layers
+ def check_frozen_layers(mdl, frozen_layers):
+ if frozen_layers >= 0:
+ top_layers = [
+ 'ln_final', 'text_projection', 'logit_scale', 'visual.ln_post',
+ 'visual.proj'
+ ]
+ mid_layers = [
+ 'visual.transformer.resblocks', 'transformer.resblocks'
+ ]
+
+ for name, param in mdl.clip.named_parameters():
+ if any(name.find(n) == 0 for n in top_layers):
+ assert param.requires_grad is True
+ elif any(name.find(n) == 0 for n in mid_layers):
+ layer_n = int(name.split('.resblocks.')[1].split('.')[0])
+ if layer_n >= frozen_layers:
+ assert param.requires_grad is True
+ else:
+ assert param.requires_grad is False
+ else:
+ assert param.requires_grad is False
+ else:
+ assert all([p.requires_grad for p in mdl.clip.parameters()])
+
+ check_frozen_layers(model, -1)
+
+ model.frozen_layers = 0
+ model.train()
+ check_frozen_layers(model, 0)
+
+ model.frozen_layers = 6
+ model.train()
+ check_frozen_layers(model, 6)
+
+ model.frozen_layers = 12
+ model.train()
+ check_frozen_layers(model, 12)
diff --git a/tests/models/utils/test_gradcam.py b/tests/models/utils/test_gradcam.py
index b53d37886f..e9568531c5 100644
--- a/tests/models/utils/test_gradcam.py
+++ b/tests/models/utils/test_gradcam.py
@@ -138,6 +138,7 @@ def test_slowfast():
_do_test_3D_models(recognizer, target_layer_name, input_shape)
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tsm():
config = get_recognizer_cfg(
'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py')
@@ -158,6 +159,7 @@ def test_tsm():
_do_test_2D_models(recognizer, target_layer_name, input_shape)
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_csn():
config = get_recognizer_cfg(
'csn/ipcsn_ig65m-pretrained-r152-bnfrozen_32x2x1-58e_kinetics400-rgb.py' # noqa: E501
diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py
index 224b8364bc..696537ea4a 100644
--- a/tools/analysis_tools/confusion_matrix.py
+++ b/tools/analysis_tools/confusion_matrix.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
+import logging
import tempfile
import torch
@@ -83,6 +84,7 @@ def main():
classes = runner.test_loop.dataloader.dataset.metainfo.get(
'classes')
cm = runner.test()['confusion_matrix/result']
+ logging.shutdown()
else:
predictions = load(args.ckpt_or_result)
evaluator = Evaluator(ConfusionMatrix())
diff --git a/tools/analysis_tools/eval_metric.py b/tools/analysis_tools/eval_metric.py
index 08b6da31e2..9e61ddc843 100644
--- a/tools/analysis_tools/eval_metric.py
+++ b/tools/analysis_tools/eval_metric.py
@@ -5,6 +5,7 @@
from mmengine import Config, DictAction
from mmengine.evaluator import Evaluator
from mmengine.registry import init_default_scope
+from rich import print
def parse_args():
diff --git a/tools/analysis_tools/report_accuracy.py b/tools/analysis_tools/report_accuracy.py
index 2008f9fb46..c361f644de 100644
--- a/tools/analysis_tools/report_accuracy.py
+++ b/tools/analysis_tools/report_accuracy.py
@@ -1,11 +1,13 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
+import numpy as np
from mmengine import load
from scipy.special import softmax
from mmaction.evaluation.functional import (get_weighted_score,
mean_class_accuracy,
+ mmit_mean_average_precision,
top_k_accuracy)
@@ -23,6 +25,10 @@ def parse_args():
help='coefficients of each score file',
default=[1.0, 1.0])
parser.add_argument('--apply-softmax', action='store_true')
+ parser.add_argument(
+ '--multi-label',
+ action='store_true',
+ help='whether the task is multi-label classification')
args = parser.parse_args()
return args
@@ -37,9 +43,16 @@ def main():
sample['pred_scores']['item'].numpy() for sample in data_samples
]
score_list.append(scores)
- labels = [
- sample['gt_labels']['item'].item() for sample in data_sample_list[0]
- ]
+
+ if args.multi_label:
+ labels = [
+ sample['gt_labels']['item'] for sample in data_sample_list[0]
+ ]
+ else:
+ labels = [
+ sample['gt_labels']['item'].item()
+ for sample in data_sample_list[0]
+ ]
if args.apply_softmax:
@@ -49,11 +62,16 @@ def apply_softmax(scores):
score_list = [apply_softmax(scores) for scores in score_list]
weighted_scores = get_weighted_score(score_list, args.coefficients)
- mean_class_acc = mean_class_accuracy(weighted_scores, labels)
- top_1_acc, top_5_acc = top_k_accuracy(weighted_scores, labels, (1, 5))
- print(f'Mean Class Accuracy: {mean_class_acc:.04f}')
- print(f'Top 1 Accuracy: {top_1_acc:.04f}')
- print(f'Top 5 Accuracy: {top_5_acc:.04f}')
+ if args.multi_label:
+ mean_avg_prec = mmit_mean_average_precision(
+ np.array(weighted_scores), np.stack([t.numpy() for t in labels]))
+ print(f'MMIT Mean Average Precision: {mean_avg_prec:.04f}')
+ else:
+ mean_class_acc = mean_class_accuracy(weighted_scores, labels)
+ top_1_acc, top_5_acc = top_k_accuracy(weighted_scores, labels, (1, 5))
+ print(f'Mean Class Accuracy: {mean_class_acc:.04f}')
+ print(f'Top 1 Accuracy: {top_1_acc:.04f}')
+ print(f'Top 5 Accuracy: {top_5_acc:.04f}')
if __name__ == '__main__':
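The branch above only changes how labels are prepared and scored; the fusion itself stays a plain weighted sum of the per-model score arrays. Roughly, and only as a sketch of what `get_weighted_score` is expected to do (names here are illustrative, not the real implementation):

```python
import numpy as np

def weighted_fusion(score_list, coefficients):
    # score_list: one list of per-sample score arrays per model
    return [sum(c * s for c, s in zip(coefficients, per_sample))
            for per_sample in zip(*score_list)]

scores_a = [np.array([0.2, 0.8]), np.array([0.6, 0.4])]    # model A, two samples
scores_b = [np.array([0.3, 0.7]), np.array([0.5, 0.5])]    # model B, two samples
fused = weighted_fusion([scores_a, scores_b], [1.0, 1.0])  # element-wise weighted sum
```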
diff --git a/tools/analysis_tools/report_map.py b/tools/analysis_tools/report_map.py
index 2aa46a1c50..26d1bc0f9f 100644
--- a/tools/analysis_tools/report_map.py
+++ b/tools/analysis_tools/report_map.py
@@ -3,10 +3,10 @@
import os
import os.path as osp
-import mmcv
+import mmengine
import numpy as np
-from mmaction.core import ActivityNetLocalization
+from mmaction.evaluation import ActivityNetLocalization
args = None
@@ -17,9 +17,9 @@ def cuhk17_top1():
if not osp.exists('cuhk_anet17_pred.json'):
os.system('wget https://download.openmmlab.com/'
'mmaction/localization/cuhk_anet17_pred.json')
- proposal = mmcv.load(args.proposal)
+ proposal = mmengine.load(args.proposal)
results = proposal['results']
- cuhk_pred = mmcv.load('cuhk_anet17_pred.json')['results']
+ cuhk_pred = mmengine.load('cuhk_anet17_pred.json')['results']
def get_topk(preds, k):
preds.sort(key=lambda x: x['score'])
@@ -36,7 +36,7 @@ def get_topk(preds, k):
new_value.append(x)
results[k] = new_value
proposal['results'] = results
- mmcv.dump(proposal, args.det_output)
+ mmengine.dump(proposal, args.det_output)
cls_funcs = {'cuhk17_top1': cuhk17_top1}
diff --git a/tools/data/activitynet/README.md b/tools/data/activitynet/README.md
index f3286f6fc1..17daef6acd 100644
--- a/tools/data/activitynet/README.md
+++ b/tools/data/activitynet/README.md
@@ -78,7 +78,7 @@ For this case, the downloading scripts update the annotation file after download
### Step 3. Extract RGB and Flow
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
Use following scripts to extract both RGB and Flow.
@@ -87,7 +87,7 @@ bash extract_frames.sh
```
The command above can generate images with new short edge 256. If you want to generate images with short edge 320 (320p), or with fix size 340x256, you can change the args `--new-short 256` to `--new-short 320` or `--new-width 340 --new-height 256`.
-More details can be found in [data_preparation](/docs/data_preparation.md)
+More details can be found in [prepare dataset](/docs/en/user_guides/prepare_dataset.md)
### Step 4. Generate File List for ActivityNet Finetuning
@@ -108,13 +108,29 @@ Both RGB models and Flow models are supported.
After finetuning TSN on ActivityNet, you can use it to extract both RGB and Flow feature.
```shell
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_train_video.txt --output-prefix ../../../data/ActivityNet/rgb_feat --modality RGB --ckpt /path/to/rgb_checkpoint.pth
-
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_val_video.txt --output-prefix ../../../data/ActivityNet/rgb_feat --modality RGB --ckpt /path/to/rgb_checkpoint.pth
-
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_train_video.txt --output-prefix ../../../data/ActivityNet/flow_feat --modality Flow --ckpt /path/to/flow_checkpoint.pth
-
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_val_video.txt --output-prefix ../../../data/ActivityNet/flow_feat --modality Flow --ckpt /path/to/flow_checkpoint.pth
+python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \
+ /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_train_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_train_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \
+ /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_val_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_val_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \
+ /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_train_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_train_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \
+ /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_val_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_val_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
```
After feature extraction, you can use our post processing scripts to concat RGB and Flow feature, generate the `100-t X 400-d` feature for Action Detection.
@@ -168,4 +184,4 @@ mmaction2
```
-For training and evaluating on ActivityNet, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on ActivityNet, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/activitynet/README_zh-CN.md b/tools/data/activitynet/README_zh-CN.md
index 7687b948db..83969041c2 100644
--- a/tools/data/activitynet/README_zh-CN.md
+++ b/tools/data/activitynet/README_zh-CN.md
@@ -78,7 +78,7 @@ bash download_bsn_videos.sh
### 步骤 3. 抽取 RGB 帧和光流
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
可使用以下命令抽取视频帧和光流。
@@ -87,7 +87,7 @@ bash extract_frames.sh
```
以上脚本将会生成短边 256 分辨率的视频。如果用户想生成短边 320 分辨率的视频(即 320p),或者 340x256 的固定分辨率,用户可以通过改变参数由 `--new-short 256` 至 `--new-short 320`,或者 `--new-width 340 --new-height 256` 进行设置
-更多细节可参考 [数据准备指南](/docs_zh_CN/data_preparation.md)
+更多细节可参考 [数据准备指南](/docs/zh_cn/user_guides/prepare_dataset.md)
### 步骤 4. 生成用于 ActivityNet 微调的文件列表
@@ -107,13 +107,29 @@ python generate_rawframes_filelist.py
在 ActivityNet 上微调 TSN 模型之后,用户可以使用该模型进行 RGB 特征和光流特征的提取。
```shell
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_train_video.txt --output-prefix ../../../data/ActivityNet/rgb_feat --modality RGB --ckpt /path/to/rgb_checkpoint.pth
-
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_val_video.txt --output-prefix ../../../data/ActivityNet/rgb_feat --modality RGB --ckpt /path/to/rgb_checkpoint.pth
-
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_train_video.txt --output-prefix ../../../data/ActivityNet/flow_feat --modality Flow --ckpt /path/to/flow_checkpoint.pth
-
-python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_val_video.txt --output-prefix ../../../data/ActivityNet/flow_feat --modality Flow --ckpt /path/to/flow_checkpoint.pth
+python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \
+ /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_train_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_train_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_rgb_feat_config.py \
+ /path/to/rgb_checkpoint.pth ../../../data/ActivityNet/rgb_val_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_val_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \
+ /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_train_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_train_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
+
+python ../../misc/clip_feature_extraction.py tsn_extract_flow_feat_config.py \
+ /path/to/flow_checkpoint.pth ../../../data/ActivityNet/flow_val_feat.pkl \
+ --video-list ../../../data/ActivityNet/anet_val_video.txt \
+ --video-root ../../../data/ActivityNet/rawframes \
+ --dump-score
```
在提取完特征后,用户可以使用后处理脚本整合 RGB 特征和光流特征,生成 `100-t X 400-d` 维度的特征用于时序动作检测。
@@ -166,4 +182,4 @@ mmaction2
```
-关于对 ActivityNet 进行训练和验证,可以参考 [基础教程](/docs_zh_CN/getting_started.md).
+关于对 ActivityNet 进行训练和验证,可以参考 [训练教程](/docs/zh_cn/user_guides/train_test.md).
diff --git a/tools/data/activitynet/activitynet_feature_postprocessing.py b/tools/data/activitynet/activitynet_feature_postprocessing.py
index 8dcd7bfe26..e1d8369886 100644
--- a/tools/data/activitynet/activitynet_feature_postprocessing.py
+++ b/tools/data/activitynet/activitynet_feature_postprocessing.py
@@ -6,7 +6,7 @@
import numpy as np
import scipy.interpolate
-from mmcv import dump, load
+from mmengine import dump, load
args = None
@@ -88,9 +88,13 @@ def merge_feat(name):
def main():
global args
args = parse_args()
- rgb_feat = os.listdir(args.rgb)
- flow_feat = os.listdir(args.flow)
+ rgb_feat = [file for file in os.listdir(args.rgb) if file.endswith('.pkl')]
+ flow_feat = [
+ file for file in os.listdir(args.flow) if file.endswith('.pkl')
+ ]
assert set(rgb_feat) == set(flow_feat)
+ # for feat in rgb_feat:
+ # merge_feat(feat)
pool = multiprocessing.Pool(32)
pool.map(merge_feat, rgb_feat)
diff --git a/tools/data/activitynet/convert_proposal_format.py b/tools/data/activitynet/convert_proposal_format.py
index f2f8613eb4..6f5992c75a 100644
--- a/tools/data/activitynet/convert_proposal_format.py
+++ b/tools/data/activitynet/convert_proposal_format.py
@@ -4,16 +4,16 @@
P-GCN, not including TSN, I3D etc.)."""
import argparse
-import mmcv
+import mmengine
import numpy as np
-from mmaction.core import pairwise_temporal_iou
+from mmaction.evaluation import pairwise_temporal_iou
def load_annotations(ann_file):
"""Load the annotation according to ann_file into video_infos."""
video_infos = []
- anno_database = mmcv.load(ann_file)
+ anno_database = mmengine.load(ann_file)
for video_name in anno_database:
video_info = anno_database[video_name]
video_info['video_name'] = video_name
@@ -143,7 +143,7 @@ def parse_args():
video_infos = load_annotations(args.ann_file)
ground_truth = import_ground_truth(video_infos, activity_index)
proposal, num_proposals = import_proposals(
- mmcv.load(args.proposal_file)['results'])
+ mmengine.load(args.proposal_file)['results'])
video_idx = 0
for video_info in video_infos:
diff --git a/tools/data/activitynet/download.py b/tools/data/activitynet/download.py
index 1d1bf41a2d..4e614e83b7 100644
--- a/tools/data/activitynet/download.py
+++ b/tools/data/activitynet/download.py
@@ -7,7 +7,7 @@
import ssl
import subprocess
-import mmcv
+import mmengine
from joblib import Parallel, delayed
ssl._create_default_https_context = ssl._create_unverified_context
@@ -101,7 +101,7 @@ def parse_activitynet_annotations(input_csv, is_bsn_case=False):
# YoutubeIDs do not have prefix `v_`
youtube_ids = [x.split(',')[0][2:] for x in lines]
else:
- data = mmcv.load(anno_file)['database']
+ data = mmengine.load(anno_file)['database']
youtube_ids = list(data.keys())
return youtube_ids
@@ -125,15 +125,15 @@ def main(input_csv, output_dir, anno_file, num_jobs=24, is_bsn_case=False):
for index in youtube_ids)
# Save download report.
- mmcv.dump(status_list, 'download_report.json')
- annotation = mmcv.load(anno_file)
+ mmengine.dump(status_list, 'download_report.json')
+ annotation = mmengine.load(anno_file)
downloaded = {status[0]: status[1] for status in status_list}
annotation = {k: v for k, v in annotation.items() if downloaded[k]}
if is_bsn_case:
anno_file_bak = anno_file.replace('.json', '_bak.json')
os.rename(anno_file, anno_file_bak)
- mmcv.dump(annotation, anno_file)
+ mmengine.dump(annotation, anno_file)
if __name__ == '__main__':
diff --git a/tools/data/activitynet/tsn_extract_flow_feat_config.py b/tools/data/activitynet/tsn_extract_flow_feat_config.py
new file mode 100644
index 0000000000..e09c3f99c1
--- /dev/null
+++ b/tools/data/activitynet/tsn_extract_flow_feat_config.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+_base_ = [
+ 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py'
+]
+
+clip_len = 5
+model = dict(
+ backbone=dict(in_channels=2 * clip_len),
+ data_preprocessor=dict(mean=[128], std=[128]))
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+
+file_client_args = dict(io_backend='disk')
+
+test_pipeline = [
+ dict(type='UntrimmedSampleFrames', clip_len=clip_len, clip_interval=16),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=256),
+ dict(type='FormatShape', input_format='NCHW_Flow'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(img=data_root_val),
+ pipeline=test_pipeline,
+ filename_tmpl='{}_{:05d}.jpg',
+ modality='Flow',
+ test_mode=True))
+
+test_evaluator = []
+
+test_cfg = dict(type='TestLoop')
diff --git a/tools/data/activitynet/tsn_extract_rgb_feat_config.py b/tools/data/activitynet/tsn_extract_rgb_feat_config.py
new file mode 100644
index 0000000000..803e031935
--- /dev/null
+++ b/tools/data/activitynet/tsn_extract_rgb_feat_config.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+_base_ = [
+ 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root_val = 'data/kinetics400/rawframes_val'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
+
+file_client_args = dict(io_backend='disk')
+
+test_pipeline = [
+ dict(type='UntrimmedSampleFrames', clip_len=1, clip_interval=16),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=256),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(img=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_evaluator = []
+
+test_cfg = dict(type='TestLoop')
diff --git a/tools/data/activitynet/tsn_extract_video_feat_config.py b/tools/data/activitynet/tsn_extract_video_feat_config.py
new file mode 100644
index 0000000000..ab815c1318
--- /dev/null
+++ b/tools/data/activitynet/tsn_extract_video_feat_config.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+_base_ = [
+ 'mmaction::_base_/models/tsn_r50.py', 'mmaction::_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UntrimmedSampleFrames', clip_len=1, clip_interval=16),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=256),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_evaluator = []
+
+test_cfg = dict(type='TestLoop')
diff --git a/tools/data/activitynet/tsn_feature_extraction.py b/tools/data/activitynet/tsn_feature_extraction.py
deleted file mode 100644
index db8318a533..0000000000
--- a/tools/data/activitynet/tsn_feature_extraction.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import argparse
-import os
-import os.path as osp
-import pickle
-
-import mmcv
-import numpy as np
-import torch
-
-from mmaction.datasets.transforms import Compose
-from mmaction.models import build_model
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description='Extract TSN Feature')
- parser.add_argument('--data-prefix', default='', help='dataset prefix')
- parser.add_argument('--output-prefix', default='', help='output prefix')
- parser.add_argument(
- '--data-list',
- help='video list of the dataset, the format should be '
- '`frame_dir num_frames output_file`')
- parser.add_argument(
- '--frame-interval',
- type=int,
- default=16,
- help='the sampling frequency of frame in the untrimed video')
- parser.add_argument('--modality', default='RGB', choices=['RGB', 'Flow'])
- parser.add_argument('--ckpt', help='checkpoint for feature extraction')
- parser.add_argument(
- '--part',
- type=int,
- default=0,
- help='which part of dataset to forward(alldata[part::total])')
- parser.add_argument(
- '--total', type=int, default=1, help='how many parts exist')
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
- args.is_rgb = args.modality == 'RGB'
- args.clip_len = 1 if args.is_rgb else 5
- args.input_format = 'NCHW' if args.is_rgb else 'NCHW_Flow'
- rgb_norm_cfg = dict(
- mean=[123.675, 116.28, 103.53],
- std=[58.395, 57.12, 57.375],
- to_bgr=False)
- flow_norm_cfg = dict(mean=[128, 128], std=[128, 128])
- args.img_norm_cfg = rgb_norm_cfg if args.is_rgb else flow_norm_cfg
- args.f_tmpl = 'img_{:05d}.jpg' if args.is_rgb else 'flow_{}_{:05d}.jpg'
- args.in_channels = args.clip_len * (3 if args.is_rgb else 2)
- # max batch_size for one forward
- args.batch_size = 200
-
- # define the data pipeline for Untrimmed Videos
- data_pipeline = [
- dict(
- type='UntrimmedSampleFrames',
- clip_len=args.clip_len,
- frame_interval=args.frame_interval,
- start_index=0),
- dict(type='RawFrameDecode'),
- dict(type='Resize', scale=(-1, 256)),
- dict(type='CenterCrop', crop_size=256),
- dict(type='Normalize', **args.img_norm_cfg),
- dict(type='FormatShape', input_format=args.input_format),
- dict(type='Collect', keys=['imgs'], meta_keys=[]),
- dict(type='ToTensor', keys=['imgs'])
- ]
- data_pipeline = Compose(data_pipeline)
-
- # define TSN R50 model, the model is used as the feature extractor
- model_cfg = dict(
- type='Recognizer2D',
- backbone=dict(
- type='ResNet',
- depth=50,
- in_channels=args.in_channels,
- norm_eval=False),
- cls_head=dict(
- type='TSNHead',
- num_classes=200,
- in_channels=2048,
- spatial_type='avg',
- consensus=dict(type='AvgConsensus', dim=1)),
- test_cfg=dict(average_clips=None))
- model = build_model(model_cfg)
- # load pretrained weight into the feature extractor
- state_dict = torch.load(args.ckpt)['state_dict']
- model.load_state_dict(state_dict)
- model = model.cuda()
- model.eval()
-
- data = open(args.data_list).readlines()
- data = [x.strip() for x in data]
- data = data[args.part::args.total]
-
- # enumerate Untrimmed videos, extract feature from each of them
- prog_bar = mmcv.ProgressBar(len(data))
- if not osp.exists(args.output_prefix):
- os.system(f'mkdir -p {args.output_prefix}')
-
- for item in data:
- frame_dir, length, _ = item.split()
- output_file = osp.basename(frame_dir) + '.pkl'
- frame_dir = osp.join(args.data_prefix, frame_dir)
- output_file = osp.join(args.output_prefix, output_file)
- assert output_file.endswith('.pkl')
- length = int(length)
-
- # prepare a pseudo sample
- tmpl = dict(
- frame_dir=frame_dir,
- total_frames=length,
- filename_tmpl=args.f_tmpl,
- start_index=0,
- modality=args.modality)
- sample = data_pipeline(tmpl)
- imgs = sample['imgs']
- shape = imgs.shape
- # the original shape should be N_seg * C * H * W, resize it to N_seg *
- # 1 * C * H * W so that the network return feature of each frame (No
- # score average among segments)
- imgs = imgs.reshape((shape[0], 1) + shape[1:])
- imgs = imgs.cuda()
-
- def forward_data(model, data):
- # chop large data into pieces and extract feature from them
- results = []
- start_idx = 0
- num_clip = data.shape[0]
- while start_idx < num_clip:
- with torch.no_grad():
- part = data[start_idx:start_idx + args.batch_size]
- feat = model.forward(part, return_loss=False)
- results.append(feat)
- start_idx += args.batch_size
- return np.concatenate(results)
-
- feat = forward_data(model, imgs)
- with open(output_file, 'wb') as fout:
- pickle.dump(feat, fout)
- prog_bar.update()
-
-
-if __name__ == '__main__':
- main()
diff --git a/tools/data/ava/README.md b/tools/data/ava/README.md
index a416eb2632..ed5089b7d1 100644
--- a/tools/data/ava/README.md
+++ b/tools/data/ava/README.md
@@ -64,7 +64,7 @@ bash cut_videos.sh
## Step 4. Extract RGB and Flow
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. And you can run the following script to soft link the extracted frames.
@@ -141,7 +141,7 @@ mmaction2
| │ │ │ ├── ...
```
-For training and evaluating on AVA, please refer to [getting_started](/docs/getting_started.md).
+For training and evaluating on AVA, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Reference
diff --git a/tools/data/ava/README_zh-CN.md b/tools/data/ava/README_zh-CN.md
index 5a7b96da88..5a712798e5 100644
--- a/tools/data/ava/README_zh-CN.md
+++ b/tools/data/ava/README_zh-CN.md
@@ -56,7 +56,7 @@ bash cut_videos.sh
## 4. 提取 RGB 帧和光流
-在提取之前,请参考 [安装教程](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果用户有足够的 SSD 空间,那么建议将视频抽取为 RGB 帧以提升 I/O 性能。用户可以使用以下脚本为抽取得到的帧文件夹建立软连接:
@@ -131,4 +131,4 @@ mmaction2
| │ │ │ ├── ...
```
-关于 AVA 数据集上的训练与测试,请参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于 AVA 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/ava/download_videos_parallel.py b/tools/data/ava/download_videos_parallel.py
index 7be4b1b883..524cf37d2e 100644
--- a/tools/data/ava/download_videos_parallel.py
+++ b/tools/data/ava/download_videos_parallel.py
@@ -3,7 +3,7 @@
import os.path as osp
import subprocess
-import mmcv
+import mmengine
from joblib import Parallel, delayed
URL_PREFIX = 'https://s3.amazonaws.com/ava-dataset/trainval/'
@@ -36,7 +36,7 @@ def download_video(video_url, output_dir, num_attempts=5):
def main(source_file, output_dir, num_jobs=24, num_attempts=5):
- mmcv.mkdir_or_exist(output_dir)
+ mmengine.mkdir_or_exist(output_dir)
video_list = open(source_file).read().strip().split('\n')
video_list = [osp.join(URL_PREFIX, video) for video in video_list]
@@ -49,7 +49,7 @@ def main(source_file, output_dir, num_jobs=24, num_attempts=5):
delayed(download_video)(video, output_dir, num_attempts)
for video in video_list)
- mmcv.dump(status_list, 'download_report.json')
+ mmengine.dump(status_list, 'download_report.json')
if __name__ == '__main__':
diff --git a/tools/data/ava_kinetics/prepare_annotation.py b/tools/data/ava_kinetics/prepare_annotation.py
index 00b7669d49..2e53916540 100644
--- a/tools/data/ava_kinetics/prepare_annotation.py
+++ b/tools/data/ava_kinetics/prepare_annotation.py
@@ -60,7 +60,7 @@ def filter_train_list(kinetics_anotation_file, lookup):
'--avakinetics_anotation',
type=str,
default='./ava_kinetics_v1_0',
- help='the directory to ava-kinetics anotations')
+ help='the directory to ava-kinetics annotations')
p.add_argument(
'--num_workers',
type=int,
diff --git a/tools/data/diving48/README.md b/tools/data/diving48/README.md
index 588cddd173..02d19d5a62 100644
--- a/tools/data/diving48/README.md
+++ b/tools/data/diving48/README.md
@@ -15,11 +15,28 @@
```
For basic dataset information, you can refer to the official dataset [website](http://www.svcl.ucsd.edu/projects/resound/dataset.html).
-Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/diving48/`.
+
+`````{tabs}
+
+````{group-tab} Download by MIM
+MIM supports downloading from OpenDataLab and preprocessing the Diving48 dataset with a single command.
+```Bash
+# install OpenDataLab CLI tools
+pip install -U opendatalab
+# log in OpenDataLab
+odl login
+# download and preprocess by MIM
+mim download mmaction2 --dataset diving48
+```
+
+````
+
+````{group-tab} Download from Official Source
## Step 1. Prepare Annotations
You can run the following script to download annotations (considering the correctness of annotation files, we only download V2 version here).
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/diving48/`.
```shell
bash download_annotations.sh
@@ -39,7 +56,7 @@ This part is **optional** if you only want to use the video loader.
The frames provided in official compressed file are not complete. You may need to go through the following extraction steps to get the complete frames.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance.
@@ -81,7 +98,10 @@ bash generate_videos_filelist.sh
bash generate_rawframes_filelist.sh
```
-## Step 5. Check Directory Structure
+````
+`````
+
+### Check Directory Structure
After the whole data process for Diving48 preparation,
you will get the rawframes (RGB + Flow), videos and annotation files for Diving48.
@@ -97,7 +117,7 @@ mmaction2
│ ├── diving48
│ │ ├── diving48_{train,val}_list_rawframes.txt
│ │ ├── diving48_{train,val}_list_videos.txt
-│ │ ├── annotations
+│ │ ├── annotations (optional)
│ | | ├── Diving48_V2_train.json
│ | | ├── Diving48_V2_test.json
│ | | ├── Diving48_vocab.json
@@ -105,7 +125,7 @@ mmaction2
│ | | ├── _8Vy3dlHg2w_00000.mp4
│ | | ├── _8Vy3dlHg2w_00001.mp4
│ | | ├── ...
-│ | ├── rawframes
+│ | ├── rawframes (optional)
│ | | ├── 2x00lRzlTVQ_00000
│ | | | ├── img_00001.jpg
│ | | | ├── img_00002.jpg
@@ -120,4 +140,4 @@ mmaction2
│ | | ├── ...
```
-For training and evaluating on Diving48, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on Diving48, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/diving48/README_zh-CN.md b/tools/data/diving48/README_zh-CN.md
index e91f8729a5..825344039e 100644
--- a/tools/data/diving48/README_zh-CN.md
+++ b/tools/data/diving48/README_zh-CN.md
@@ -15,11 +15,26 @@
```
用户可参考该数据集的 [官网](http://www.svcl.ucsd.edu/projects/resound/dataset.html),以获取数据集相关的基本信息。
-在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/diving48/`。
+`````{tabs}
+
+````{group-tab} 使用 MIM 下载
+MIM 支持下载 Diving48 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。
+```Bash
+# 安装 OpenDataLab CLI 工具
+pip install -U opendatalab
+# 登录 OpenDataLab
+odl login
+# 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset diving48
+```
+
+````
+
+````{group-tab} 从官方源下载
## 步骤 1. 下载标注文件
-用户可以使用以下命令下载标注文件(考虑到标注的准确性,这里仅下载 V2 版本)。
+用户可以使用以下命令下载标注文件(考虑到标注的准确性,这里仅下载 V2 版本)。在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/diving48/`。
```shell
bash download_annotations.sh
@@ -39,7 +54,7 @@ bash download_videos.sh
官网提供的帧压缩包并不完整。若想获取完整的数据,可以使用以下步骤解帧。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。
@@ -81,7 +96,10 @@ bash generate_videos_filelist.sh
bash generate_rawframes_filelist.sh
```
-## 步骤 5. 检查文件夹结构
+````
+`````
+
+### 检查文件夹结构
在完成所有 Diving48 数据集准备流程后,
用户可以获得对应的 RGB + 光流文件,视频文件以及标注文件。
@@ -97,7 +115,7 @@ mmaction2
│ ├── diving48
│ │ ├── diving48_{train,val}_list_rawframes.txt
│ │ ├── diving48_{train,val}_list_videos.txt
-│ │ ├── annotations
+│ │ ├── annotations(可选)
│ | | ├── Diving48_V2_train.json
│ | | ├── Diving48_V2_test.json
│ | | ├── Diving48_vocab.json
@@ -105,7 +123,7 @@ mmaction2
│ | | ├── _8Vy3dlHg2w_00000.mp4
│ | | ├── _8Vy3dlHg2w_00001.mp4
│ | | ├── ...
-│ | ├── rawframes
+│ | ├── rawframes(可选)
│ | | ├── 2x00lRzlTVQ_00000
│ | | | ├── img_00001.jpg
│ | | | ├── img_00002.jpg
@@ -120,4 +138,4 @@ mmaction2
│ | | ├── ...
```
-关于对 Diving48 进行训练和验证,可以参考 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 Diving48 进行训练和验证,请参考 [训练和测试教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/diving48/preprocess.sh b/tools/data/diving48/preprocess.sh
new file mode 100644
index 0000000000..10d9e42044
--- /dev/null
+++ b/tools/data/diving48/preprocess.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
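+# $1: directory holding the archives downloaded from OpenDataLab; $2: target data root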
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+cat $DOWNLOAD_DIR/diving48/raw/*.tar.gz.* | tar -xvz -C $(dirname $DATA_ROOT)
+tar -xvf $DATA_ROOT/diving48.tar -C $(dirname $DATA_ROOT)
+rm $DATA_ROOT/diving48.tar
diff --git a/tools/data/gym/README.md b/tools/data/gym/README.md
index a39eda6fd4..84386c012e 100644
--- a/tools/data/gym/README.md
+++ b/tools/data/gym/README.md
@@ -55,7 +55,7 @@ python trim_subaction.py
This part is **optional** if you only want to use the video loader for RGB model training.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
Run the following script to extract both rgb and flow using "tvl1" algorithm.
@@ -106,4 +106,4 @@ mmaction2
| | └── subaction_frames
```
-For training and evaluating on GYM, please refer to [getting_started](/docs/getting_started.md).
+For training and evaluating on GYM, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/gym/README_zh-CN.md b/tools/data/gym/README_zh-CN.md
index cb3a796ec7..0b3e54aeb1 100644
--- a/tools/data/gym/README_zh-CN.md
+++ b/tools/data/gym/README_zh-CN.md
@@ -55,7 +55,7 @@ python trim_subaction.py
如果用户仅使用 video loader,则可以跳过本步。
-在提取之前,请参考 [安装教程](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
用户可使用如下脚本同时抽取 RGB 帧和光流(提取光流时使用 tvl1 算法):
@@ -106,4 +106,4 @@ mmaction2
| | └── subaction_frames
```
-关于 GYM 数据集上的训练与测试,请参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于 GYM 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/hacs/README-CN.md b/tools/data/hacs/README-CN.md
new file mode 100644
index 0000000000..245d6313fe
--- /dev/null
+++ b/tools/data/hacs/README-CN.md
@@ -0,0 +1,119 @@
+# 准备 HACS Segments
+
+## 简介
+
+
+
+```BibTeX
+@inproceedings{zhao2019hacs,
+ title={Hacs: Human action clips and segments dataset for recognition and temporal localization},
+ author={Zhao, Hang and Torralba, Antonio and Torresani, Lorenzo and Yan, Zhicheng},
+ booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+ pages={8668--8678},
+ year={2019}
+}
+```
+
+### 0. 下载视频
+
+在我们开始准备数据集之前,请按照[官方代码库](https://github.com/hangzhaomit/HACS-dataset)的指令下载HACS Segments数据集中的视频。如果有视频缺失,您可以向HACS数据集存储库的维护者提交请求以获取缺失的视频。但是如果一些视频缺失,您仍然可以为MMAction2准备数据集。
+
+在下载完数据集后,请将数据集文件夹移动到(或者使用软链接)`$MMACTION2/tools/data/hacs/`。文件夹结构应该如下所示:
+
+```
+mmaction2
+├── mmaction
+├── data
+├── configs
+├── tools
+│ ├── hacs
+│ │ ├── slowonly_feature_infer.py
+│ │ ├── ..
+│ │ ├── data
+│ │ │ ├── Applying_sunscreen
+│ │ │ │ ├── v_0Ch__DqMPwA.mp4
+│ │ │ │ ├── v_9CTDjFHl8WE.mp4
+│ │ │ │ ├── ..
+
+
+```
+
+在开始之前,请确保您位于`$MMACTION2/tools/data/hacs/`路径下。
+
+### 1. 提取特征
+
+以下是使用[SlowOnly ResNet50 8x8](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py)在Kinetics700数据集上预训练的模型,从HACS视频中提取特征。对于每个视频,我们均匀采样100个视频片段,并提取700维输出(softmax之前)作为特征,即特征形状为100x700。
+
+首先,我们使用如下命令生成数据集的视频列表:
+
+```
+python generate_list.py
+```
+
+这将生成一个位于`$MMACTION2/tools/data/hacs/`的`hacs_data.txt`文件,其内容格式如下:
+
+```
+Horseback_riding/v_Sr2BSq_8FMw.mp4 0
+Horseback_riding/v_EQb6OKoqz3Q.mp4 1
+Horseback_riding/v_vYKUV8TRngg.mp4 2
+Horseback_riding/v_Y8U0X1F-0ck.mp4 3
+Horseback_riding/v_hnspbB7wNh0.mp4 4
+Horseback_riding/v_HPhlhrT9IOk.mp4 5
+```
+
+接下来,我们使用[slowonly_feature_infer.py](/tools/data/hacs/slowonly_feature_infer.py) 配置文件来提取特征:
+
+```
+# 指定提取特征的GPU数量
+NUM_GPUS=8
+
+# 下载预训练模型权重
+wget https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth
+
+bash ../../../tools/dist_test.sh \
+ slowonly_feature_infer.py \
+ slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth \
+ $NUM_GPUS --dump result.pkl
+```
+
+我们将得到一个名为 `result.pkl` 的文件,其中包含每个视频的大小为100x700的特征。我们将特征重写为csv格式,并保存在 `$MMACTION2/data/HACS/` 目录下。
+
+```
+# 确保您位于 $MMACTION2/tools/data/hacs/
+python write_feature_csv.py
+```
+
+### 2. 准备标注文件
+
+我们首先从官方仓库下载标注文件:
+
+```
+wget https://github.com/hangzhaomit/HACS-dataset/raw/master/HACS_v1.1.1.zip
+unzip HACS_v1.1.1.zip
+```
+
+解压缩后,应该有一个名为`HACS_v1.1.1`的文件夹,其中包含一个名为`HACS_segments_v1.1.1.json`的文件。
+
+我们在`$MMACTION2/data/HACS/`目录下生成`hacs_anno_train.json`、`hacs_anno_val.json`和`hacs_anno_test.json`文件:
+
+```
+python3 generate_anotations.py
+```
+
+完成这两个步骤后,HACS Segments数据集的文件夹结构应该如下所示:
+
+```
+mmaction2
+├── mmaction
+├── data
+│ ├── HACS
+│ │ ├── hacs_anno_train.json
+│ │ ├── hacs_anno_val.json
+│ │ ├── hacs_anno_test.json
+│ │ ├── slowonly_feature
+│ │ │ ├── v_008gY2B8Pf4.csv
+│ │ │ ├── v_0095rqic1n8.csv
+├── configs
+├── tools
+
+```
diff --git a/tools/data/hacs/README.md b/tools/data/hacs/README.md
new file mode 100644
index 0000000000..d567b60277
--- /dev/null
+++ b/tools/data/hacs/README.md
@@ -0,0 +1,119 @@
+# Preparing HACS Segments
+
+## Introduction
+
+
+
+```BibTeX
+@inproceedings{zhao2019hacs,
+ title={Hacs: Human action clips and segments dataset for recognition and temporal localization},
+ author={Zhao, Hang and Torralba, Antonio and Torresani, Lorenzo and Yan, Zhicheng},
+ booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+ pages={8668--8678},
+ year={2019}
+}
+```
+
+### Step 0. Download Videos
+
+Before we start preparing the dataset, please follow the official [repository](https://github.com/hangzhaomit/HACS-dataset) to download videos from the HACS Segments dataset. You can submit a request for missing videos to the maintainer of the HACS dataset repository, and you can still prepare the dataset for MMAction2 even if some videos are missing.
+
+After you finish downloading the dataset, please move the dataset folder to `$MMACTION2/tools/data/hacs/` or use a soft link. The folder structure should look like:
+
+```
+mmaction2
+├── mmaction
+├── data
+├── configs
+├── tools
+│ ├── hacs
+│ │ ├── slowonly_feature_infer.py
+│ │ ├── ..
+│ │ ├── data
+│ │ │ ├── Applying_sunscreen
+│ │ │ │ ├── v_0Ch__DqMPwA.mp4
+│ │ │ │ ├── v_9CTDjFHl8WE.mp4
+│ │ │ │ ├── ..
+
+
+```
+
+Before we start, make sure you are at `$MMACTION2/tools/data/hacs/`.
+
+### Step 1. Extract Features
+
+We extract features from the HACS videos using [SlowOnly ResNet50 8x8](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-steplr-150e_kinetics700-rgb.py) pretrained on the Kinetics700 dataset. For each video, we uniformly sample 100 video clips and extract the 700-dimensional output (before softmax) as the feature, i.e., the feature shape is 100x700.
+
+First, we generate a video list of the dataset:
+
+```
+python generate_list.py
+```
+
+It will generate a `hacs_data.txt` file located at `$MMACTION2/tools/data/hacs/`, which looks like:
+
+```
+Horseback_riding/v_Sr2BSq_8FMw.mp4 0
+Horseback_riding/v_EQb6OKoqz3Q.mp4 1
+Horseback_riding/v_vYKUV8TRngg.mp4 2
+Horseback_riding/v_Y8U0X1F-0ck.mp4 3
+Horseback_riding/v_hnspbB7wNh0.mp4 4
+Horseback_riding/v_HPhlhrT9IOk.mp4 5
+```
+
+Next we use the [slowonly_feature_infer.py](/tools/data/hacs/slowonly_feature_infer.py) config to extract features:
+
+```
+# number of GPUs to extract feature
+NUM_GPUS=8
+
+# download the pretraining checkpoint
+wget https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth
+
+bash ../../../tools/dist_test.sh \
+ slowonly_feature_infer.py \
+ slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb_20221013-15b93b10.pth \
+ $NUM_GPUS --dump result.pkl
+```
+
+We will get a `result.pkl` file that contains the 100x700 feature for each video. We then rewrite the features into CSV format and save them at `$MMACTION2/data/HACS/`:
+
+```
+# Make sure you are at $MMACTION2/tools/data/hacs/
+python write_feature_csv.py
+```
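+
+If you want to sanity-check the extracted features before converting them, you can load `result.pkl` with `mmengine` (a minimal sketch, assuming the dump format produced by the commands above):
+
+```python
+import mmengine
+
+# one entry per video listed in hacs_data.txt
+features = mmengine.load('result.pkl')
+print(len(features))
+# the per-video output is stored under pred_scores -> item
+feat = features[0]['pred_scores']['item'].numpy()
+print(feat.shape)  # expected: (100, 700)
+```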
+
+### Step 2. Prepare Annotations
+
+We first download the original annotations from the official repository:
+
+```
+wget https://github.com/hangzhaomit/HACS-dataset/raw/master/HACS_v1.1.1.zip
+unzip HACS_v1.1.1.zip
+```
+
+After unzipping, there should be an `HACS_v1.1.1` folder with an `HACS_segments_v1.1.1.json` file in it.
+
+We generate `hacs_anno_train.json`, `hacs_anno_val.json` and `hacs_anno_test.json` files at `$MMACTION2/data/HACS/`:
+
+```
+python3 generate_anotations.py
+```
+
+After the two steps are finished, the folder structure of the HACS Segments dataset should look like:
+
+```
+mmaction2
+├── mmaction
+├── data
+│ ├── HACS
+│ │ ├── hacs_anno_train.json
+│ │ ├── hacs_anno_val.json
+│ │ ├── hacs_anno_test.json
+│ │ ├── slowonly_feature
+│ │ │ ├── v_008gY2B8Pf4.csv
+│ │ │ ├── v_0095rqic1n8.csv
+├── configs
+├── tools
+
+```
diff --git a/tools/data/hacs/generate_anotations.py b/tools/data/hacs/generate_anotations.py
new file mode 100644
index 0000000000..6bcc5bf047
--- /dev/null
+++ b/tools/data/hacs/generate_anotations.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import multiprocessing
+import os
+
+import decord
+
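+# run this script from $MMACTION2/tools/data/hacs/ so that the relative paths below resolve correctly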
+with open('HACS_v1.1.1/HACS_segments_v1.1.1.json') as f:
+ all_annotations = json.load(f)['database']
+
+
+def parse_anno(key):
+ anno = {}
+ anno['duration_second'] = float(all_annotations[key]['duration'])
+ anno['annotations'] = all_annotations[key]['annotations']
+ anno['subset'] = all_annotations[key]['subset']
+
+ labels = set([i['label'] for i in anno['annotations']])
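+    # fall back to duration * 30 fps; replaced by the real frame count if the video file is found below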
+ num_frames = int(anno['duration_second'] * 30)
+ for label in labels:
+ path = f'data/{label}/v_{key}.mp4'
+ if os.path.isfile(path):
+ vr = decord.VideoReader(path)
+ num_frames = len(vr)
+ break
+
+ anno['feature_frame'] = anno['duration_frame'] = num_frames
+ anno['key'] = f'v_{key}'
+ return anno
+
+
+pool = multiprocessing.Pool(16)
+video_list = list(all_annotations)
+outputs = pool.map(parse_anno, video_list)
+
+train_anno = {}
+val_anno = {}
+test_anno = {}
+
+for anno in outputs:
+ key = anno.pop('key')
+ subset = anno.pop('subset')
+ if subset == 'training':
+ train_anno[key] = anno
+ elif subset == 'validation':
+ val_anno[key] = anno
+ else:
+ test_anno[key] = anno
+
+outdir = '../../../data/HACS'
+with open(f'{outdir}/hacs_anno_train.json', 'w') as f:
+ json.dump(train_anno, f)
+
+with open(f'{outdir}/hacs_anno_val.json', 'w') as f:
+ json.dump(val_anno, f)
+
+with open(f'{outdir}/hacs_anno_test.json', 'w') as f:
+ json.dump(test_anno, f)
diff --git a/tools/data/hacs/generate_list.py b/tools/data/hacs/generate_list.py
new file mode 100644
index 0000000000..f659c7f29a
--- /dev/null
+++ b/tools/data/hacs/generate_list.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+
+data_root = './data'
+
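+# each line of hacs_data.txt: "<class_folder>/<video_file> <index>"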
+video_list = []
+idx = 0
+for folder in os.listdir(data_root):
+ path = f'{data_root}/{folder}'
+ for video in os.listdir(path):
+ line = f'{folder}/{video} {idx}\n'
+ idx += 1
+ video_list.append(line)
+
+with open('hacs_data.txt', 'w') as f:
+ for line in video_list:
+ f.write(line)
diff --git a/tools/data/hacs/slowonly_feature_infer.py b/tools/data/hacs/slowonly_feature_infer.py
new file mode 100644
index 0000000000..3fce06add4
--- /dev/null
+++ b/tools/data/hacs/slowonly_feature_infer.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+default_hooks = dict(
+ runtime_info=dict(type='RuntimeInfoHook'),
+ timer=dict(type='IterTimerHook'),
+ logger=dict(type='LoggerHook', interval=20, ignore_last=False),
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ checkpoint=dict(type='CheckpointHook', interval=4, save_best='auto'),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ sync_buffers=dict(type='SyncBuffersHook'))
+
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'))
+
+log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='ActionVisualizer', vis_backends=[dict(type='LocalVisBackend')])
+log_level = 'INFO'
+
+model = dict(
+ type='Recognizer3D',
+ backbone=dict(
+ type='ResNet3dSlowOnly',
+ depth=50,
+ lateral=False,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1),
+ norm_eval=False),
+ cls_head=dict(
+ type='I3DHead',
+ in_channels=2048,
+ num_classes=700,
+ spatial_type='avg',
+ dropout_ratio=0.5,
+ average_clips=None),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'))
+
+data_root = './data'
+ann_file = 'hacs_data.txt'
+
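+# 100 uniformly sampled clips per video and a 700-class head give the 100x700 feature described in the README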
+test_pipeline = [
+ dict(type='DecordInit', io_backend='disk'),
+ dict(
+ type='SampleFrames',
+ clip_len=8,
+ frame_interval=8,
+ num_clips=100,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=256),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type='VideoDataset',
+ ann_file=ann_file,
+ data_prefix=dict(video=data_root),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+test_evaluator = dict(type='DumpResults', out_file_path='result.pkl')
+test_cfg = dict(type='TestLoop')
diff --git a/tools/data/hacs/write_feature_csv.py b/tools/data/hacs/write_feature_csv.py
new file mode 100644
index 0000000000..9fb098b4bf
--- /dev/null
+++ b/tools/data/hacs/write_feature_csv.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmengine
+
+features = mmengine.load('result.pkl')
+video_list = mmengine.list_from_file('hacs_data.txt')
+feature_dir = '../../../data/HACS/slowonly_feature'
+mmengine.mkdir_or_exist(feature_dir)
+
+head = ','.join([f'f{i}' for i in range(700)]) + '\n'
+
+for feature, video in zip(features, video_list):
+ video_id = video.split()[0].split('/')[1]
+ csv_file = video_id.replace('mp4', 'csv')
+ feat = feature['pred_scores']['item'].numpy()
+ feat = feat.tolist()
+ csv_path = f'{feature_dir}/{csv_file}'
+ with open(csv_path, 'w') as f:
+ f.write(head)
+ for line in feat:
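+            # str(list)[1:-1] strips the surrounding brackets, leaving comma-separated values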
+ f.write(str(line)[1:-1] + '\n')
diff --git a/tools/data/hmdb51/README.md b/tools/data/hmdb51/README.md
index 206b548764..4993f6705b 100644
--- a/tools/data/hmdb51/README.md
+++ b/tools/data/hmdb51/README.md
@@ -41,7 +41,7 @@ bash download_videos.sh
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance.
@@ -122,4 +122,4 @@ mmaction2
```
-For training and evaluating on HMDB51, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on HMDB51, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/hmdb51/README_zh-CN.md b/tools/data/hmdb51/README_zh-CN.md
index a34c4b9ce9..2da13751cd 100644
--- a/tools/data/hmdb51/README_zh-CN.md
+++ b/tools/data/hmdb51/README_zh-CN.md
@@ -39,7 +39,7 @@ bash download_videos.sh
如果用户只想使用视频加载训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。
用户可使用以下命令为 SSD 建立软链接。
@@ -118,4 +118,4 @@ mmaction2
```
-关于对 HMDB51 进行训练和验证,可以参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 HMDB51 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/hvu/README.md b/tools/data/hvu/README.md
index 6bcc73f862..dd7952922c 100644
--- a/tools/data/hvu/README.md
+++ b/tools/data/hvu/README.md
@@ -43,7 +43,7 @@ bash download_videos.sh
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
You can use the following script to extract both RGB and Flow frames.
@@ -52,7 +52,7 @@ bash extract_frames.sh
```
By default, we generate frames with short edge resized to 256.
-More details can be found in [data_preparation](/docs/data_preparation.md)
+More details can be found in [prepare_dataset](/docs/en/user_guides/prepare_dataset.md)
## Step 4. Generate File List
@@ -120,4 +120,4 @@ mmaction2
```
-For training and evaluating on HVU, please refer to [getting_started](/docs/getting_started.md).
+For training and evaluating on HVU, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/hvu/README_zh-CN.md b/tools/data/hvu/README_zh-CN.md
index 5b3ffa1ea3..cbb9613d8e 100644
--- a/tools/data/hvu/README_zh-CN.md
+++ b/tools/data/hvu/README_zh-CN.md
@@ -43,7 +43,7 @@ bash download_videos.sh
如果用户仅使用 video loader,则可以跳过本步。
-在提取之前,请参考 [安装教程](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
用户可使用如下脚本同时抽取 RGB 帧和光流:
@@ -51,7 +51,7 @@ bash download_videos.sh
bash extract_frames.sh
```
-该脚本默认生成短边长度为 256 的帧,可参考 [数据准备](/docs_zh_CN/data_preparation.md) 获得更多细节。
+该脚本默认生成短边长度为 256 的帧,可参考 [数据准备](/docs/zh_cn/user_guides/prepare_dataset.md) 获得更多细节。
## 4. 生成文件列表
@@ -107,4 +107,4 @@ mmaction2
```
-关于 HVU 数据集上的训练与测试,请参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于 HVU 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/hvu/download.py b/tools/data/hvu/download.py
index 2ab18e8434..29fcd5c3b2 100644
--- a/tools/data/hvu/download.py
+++ b/tools/data/hvu/download.py
@@ -11,7 +11,7 @@
import subprocess
import uuid
-import mmcv
+import mmengine
from joblib import Parallel, delayed
ssl._create_default_https_context = ssl._create_unverified_context
@@ -173,7 +173,7 @@ def main(input_csv,
# Clean tmp dir.
shutil.rmtree(tmp_dir)
# Save download report.
- mmcv.dump(status_lst, 'download_report.json')
+ mmengine.dump(status_lst, 'download_report.json')
if __name__ == '__main__':
diff --git a/tools/data/hvu/generate_file_list.py b/tools/data/hvu/generate_file_list.py
index 83e99b1482..ecc4063ffa 100644
--- a/tools/data/hvu/generate_file_list.py
+++ b/tools/data/hvu/generate_file_list.py
@@ -5,7 +5,7 @@
import os
import os.path as osp
-import mmcv
+import mmengine
annotation_root = '../../data/hvu/annotations'
tag_file = 'hvu_tags.json'
@@ -109,7 +109,7 @@ def parse_args():
if __name__ == '__main__':
args = parse_args()
- tag_cates = mmcv.load(tag_file)
+ tag_cates = mmengine.load(tag_file)
tag2category = {}
for k in tag_cates:
for tag in tag_cates[k]:
@@ -149,4 +149,4 @@ def parse_line(line):
]
elif args.mode == 'videos':
result = [dict(filename=k[0] + '.mp4', label=k[1]) for k in data_list]
- mmcv.dump(result, args.output)
+ mmengine.dump(result, args.output)
diff --git a/tools/data/hvu/generate_sub_file_list.py b/tools/data/hvu/generate_sub_file_list.py
index 8313a9b3c9..ee14b82f6b 100644
--- a/tools/data/hvu/generate_sub_file_list.py
+++ b/tools/data/hvu/generate_sub_file_list.py
@@ -2,7 +2,7 @@
import argparse
import os.path as osp
-import mmcv
+import mmengine
def main(annotation_file, category):
@@ -10,7 +10,7 @@ def main(annotation_file, category):
'action', 'attribute', 'concept', 'event', 'object', 'scene'
]
- data = mmcv.load(annotation_file)
+ data = mmengine.load(annotation_file)
basename = osp.basename(annotation_file)
dirname = osp.dirname(annotation_file)
basename = basename.replace('hvu', f'hvu_{category}')
@@ -24,7 +24,7 @@ def main(annotation_file, category):
item['label'] = label[category]
result.append(item)
- mmcv.dump(data, target_file)
+ mmengine.dump(data, target_file)
if __name__ == '__main__':
diff --git a/tools/data/hvu/parse_tag_list.py b/tools/data/hvu/parse_tag_list.py
index 0871491ef8..b4d0d699bf 100644
--- a/tools/data/hvu/parse_tag_list.py
+++ b/tools/data/hvu/parse_tag_list.py
@@ -1,5 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
-import mmcv
+import mmengine
tag_list = '../../../data/hvu/annotations/hvu_categories.csv'
@@ -13,4 +13,4 @@
for k in tag_categories:
tag_categories[k].sort()
-mmcv.dump(tag_categories, 'hvu_tags.json')
+mmengine.dump(tag_categories, 'hvu_tags.json')
diff --git a/tools/data/jester/README.md b/tools/data/jester/README.md
index e90841a850..616a29a9a4 100644
--- a/tools/data/jester/README.md
+++ b/tools/data/jester/README.md
@@ -64,7 +64,7 @@ data = dict(
This part is **optional** if you only want to use RGB frames.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance.
@@ -140,4 +140,4 @@ mmaction2
```
-For training and evaluating on Jester, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on Jester, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/jester/README_zh-CN.md b/tools/data/jester/README_zh-CN.md
index 7660c23ae7..22b4680a7d 100644
--- a/tools/data/jester/README_zh-CN.md
+++ b/tools/data/jester/README_zh-CN.md
@@ -64,7 +64,7 @@ data = dict(
如果用户只想使用 RGB 帧训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。
@@ -140,4 +140,4 @@ mmaction2
```
-关于对 jester 进行训练和验证,可以参考 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 jester 进行训练和验证,请参考 [训练和测试教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md
index 0df8f8634f..257c650eba 100644
--- a/tools/data/kinetics/README.md
+++ b/tools/data/kinetics/README.md
@@ -15,8 +15,7 @@
}
```
-For basic dataset information, please refer to the official [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/). The scripts can be used for preparing kinetics400, kinetics600, kinetics700. To prepare different version of kinetics, you need to replace `${DATASET}` in the following examples with the specific dataset name. The choices of dataset names are `kinetics400`, `kinetics600` and `kinetics700`.
-Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/${DATASET}/`.
+For basic dataset information, please refer to the official [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/).
:::{note}
Because of the expirations of some YouTube links, the sizes of kinetics dataset copies may be different. Here are the sizes of our kinetics dataset copies that used to train all checkpoints.
@@ -29,8 +28,38 @@ Because of the expirations of some YouTube links, the sizes of kinetics dataset
:::
+`````{tabs}
+
+````{group-tab} Download by MIM
+:::{note}
+All experiments on Kinetics in MMAction2 are based on this version, so we recommend users try this version.
+:::
+
+MIM supports downloading from OpenDataLab and preprocessing the Kinetics-400/600/700 datasets with a single command.
+
+```Bash
+# install OpenDataLab CLI tools
+pip install -U opendatalab
+# log in OpenDataLab
+odl login
+# download and preprocess Kinetics-400 by MIM. Note that this might take a long time.
+mim download mmaction2 --dataset kinetics400
+# download and preprocess Kinetics-600 by MIM. Note that this might take a long time.
+mim download mmaction2 --dataset kinetics600
+# download and preprocess Kinetics-700 by MIM. Note that this might take a long time.
+mim download mmaction2 --dataset kinetics700
+
+```
+
+````
+
+````{group-tab} Download from Official Source
+
## Step 1. Prepare Annotations
+The scripts can be used for preparing kinetics400, kinetics600 and kinetics700. To prepare a different version of kinetics, you need to replace `${DATASET}` in the following examples with the specific dataset name. The choices of dataset names are `kinetics400`, `kinetics600` and `kinetics700`.
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/${DATASET}/`.
+
First of all, you can run the following script to prepare annotations by downloading from the official [website](https://deepmind.com/research/open-source/open-source-datasets/kinetics/).
```shell
@@ -48,15 +77,6 @@ bash download_backup_annotations.sh ${DATASET}
## Step 2. Prepare Videos
-### Option 1: Download from OpenDataLab
-
-**Recommend**: [OpenDataLab](https://opendatalab.com/) provides the Kinetics dataset ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)), users can download Kinetics dataset with short edge 320 pixels from here.
-
-:::{note}
-All experiments on Kinetics in MMAction2 are based on this version, we recommend users to try this version.
-
-### Option 2: Download from Other Source
-
you can run the following script to prepare videos.
The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time.
@@ -83,7 +103,7 @@ You can also download from [Academic Torrents](https://academictorrents.com/) ([
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. And you can run the following script to soft link the extracted frames.
@@ -114,7 +134,7 @@ bash extract_frames.sh ${DATASET}
```
The commands above can generate images with new short edge 256. If you want to generate images with short edge 320 (320p), or with fix size 340x256, you can change the args `--new-short 256` to `--new-short 320` or `--new-width 340 --new-height 256`.
-More details can be found in [data_preparation](/docs/data_preparation.md)
+More details can be found in [prepare dataset](/docs/en/user_guides/prepare_dataset.md).
## Step 4. Generate File List
@@ -126,7 +146,10 @@ bash generate_videos_filelist.sh ${DATASET}
bash generate_rawframes_filelist.sh ${DATASET}
```
-## Step 5. Folder Structure
+````
+`````
+
+### Folder Structure
After the whole data pipeline for Kinetics preparation.
you can get the rawframes (RGB + Flow), videos and annotation files for Kinetics.
@@ -153,9 +176,9 @@ mmaction2
│ │ │ ├── wrapping_present
│ │ │ ├── ...
│ │ │ ├── zumba
-│ │ ├── rawframes_train
-│ │ ├── rawframes_val
+│ │ ├── rawframes_train (optional)
+│ │ ├── rawframes_val (optional)
```
-For training and evaluating on Kinetics, please refer to [getting_started](/docs/getting_started.md).
+For training and evaluating on Kinetics, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md
index 86cb65239e..a0d2e858e1 100644
--- a/tools/data/kinetics/README_zh-CN.md
+++ b/tools/data/kinetics/README_zh-CN.md
@@ -15,8 +15,7 @@
}
```
-请参照 [官方网站](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) 以获取数据集基本信息。此脚本用于准备数据集 kinetics400,kinetics600,kinetics700。为准备 kinetics 数据集的不同版本,用户需将脚本中的 `${DATASET}` 赋值为数据集对应版本名称,可选项为 `kinetics400`,`kinetics600`, `kinetics700`。
-在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/${DATASET}/`。
+请参照 [官方网站](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) 以获取数据集基本信息。
:::{note}
由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小:
@@ -26,9 +25,36 @@
| Kinetics400 | 240436 | 19796 |
| Kinetics600 | 383393 | 27910 |
| Kinetics700 | 542357 | 34824 |
+:::
+
+`````{tabs}
+
+````{group-tab} 使用 MIM 下载
+:::{note}
+MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版本的数据得到的。我们建议用户使用这个版本的 Kinetics 数据集进行实验。
+:::
+
+MIM 支持下载 Kinetics-400/600/700 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。
+```Bash
+# 安装 OpenDataLab CLI 工具
+pip install -U opendatalab
+# 登录 OpenDataLab
+odl login
+# 通过 MIM 进行 Kinetics-400 数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset kinetics400
+# 通过 MIM 进行 Kinetics-600 数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset kinetics600
+# 通过 MIM 进行 Kinetics-700 数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset kinetics700
+```
+````
+
+````{group-tab} 从官方源下载
## 1. 准备标注文件
+此脚本用于准备数据集 kinetics400,kinetics600,kinetics700。为准备 kinetics 数据集的不同版本,用户需将脚本中的 `${DATASET}` 赋值为数据集对应版本名称,可选项为 `kinetics400`,`kinetics600`, `kinetics700`。
+在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/${DATASET}/`。
首先,用户可以使用如下脚本从 [Kinetics 数据集官网](https://deepmind.com/research/open-source/open-source-datasets/kinetics/)下载标注文件并进行预处理:
```shell
@@ -45,15 +71,6 @@ bash download_backup_annotations.sh ${DATASET}
## 2. 准备视频
-### 选项 1: 从 OpenDataLab 下载
-
-**推荐**:[OpenDataLab](https://opendatalab.com/) 提供了 Kinetics 数据集 ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)), 用户可以从这里下载短边长度为 320 的 Kinetics 数据集。
-
-:::{note}
-MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版本的数据得到的。我们建议用户使用这个版本的 Kinetics 数据集进行实验。
-
-### 选项 2:从其他数据源下载
-
用户可以使用以下脚本准备视频,视频准备代码修改自 [官方爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)。注意这一步骤将花费较长时间。
```shell
@@ -78,7 +95,7 @@ python ../resize_videos.py ../../../data/${DATASET}/videos_train/ ../../../data/
如果用户仅使用 video loader,则可以跳过本步。
-在提取之前,请参考 [安装教程](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在提取之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果用户有足够的 SSD 空间,那么建议将视频抽取为 RGB 帧以提升 I/O 性能。用户可以使用以下脚本为抽取得到的帧文件夹建立软连接:
@@ -109,7 +126,7 @@ bash extract_frames.sh ${DATASET}
```
以上的命令生成短边长度为 256 的 RGB 帧和光流帧。如果用户需要生成短边长度为 320 的帧 (320p),或是固定分辨率为 340 x 256 的帧,可改变参数 `--new-short 256` 为 `--new-short 320` 或 `--new-width 340 --new-height 256`。
-更多细节可以参考 [数据准备](/docs_zh_CN/data_preparation.md)。
+更多细节可以参考 [数据准备](/docs/zh_cn/user_guides/prepare_dataset.md)。
## 4. 生成文件列表
@@ -121,7 +138,10 @@ bash generate_videos_filelist.sh ${DATASET}
bash generate_rawframes_filelist.sh ${DATASET}
```
-## 5. 目录结构
+````
+`````
+
+### 目录结构
在完整完成 Kinetics 的数据处理后,将得到帧文件夹(RGB 帧和光流帧),视频以及标注文件。
@@ -136,7 +156,7 @@ mmaction2
│ ├── ${DATASET}
│ │ ├── ${DATASET}_train_list_videos.txt
│ │ ├── ${DATASET}_val_list_videos.txt
-│ │ ├── annotations
+│ │ ├── annotations(可选)
│ │ ├── videos_train
│ │ ├── videos_val
│ │ │ ├── abseiling
@@ -146,9 +166,9 @@ mmaction2
│ │ │ ├── wrapping_present
│ │ │ ├── ...
│ │ │ ├── zumba
-│ │ ├── rawframes_train
-│ │ ├── rawframes_val
+│ │ ├── rawframes_train(可选)
+│ │ ├── rawframes_val(可选)
```
-关于 Kinetics 数据集上的训练与测试,请参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于 Kinetics 数据集上的训练与测试,请参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/kinetics/preprocess_k400.sh b/tools/data/kinetics/preprocess_k400.sh
new file mode 100644
index 0000000000..9f07885095
--- /dev/null
+++ b/tools/data/kinetics/preprocess_k400.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+set -x
+
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+cat $DOWNLOAD_DIR/Kinetics-400/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT)
+mv $(dirname $DATA_ROOT)/Kinetics-400 $DATA_ROOT
diff --git a/tools/data/kinetics/preprocess_k600.sh b/tools/data/kinetics/preprocess_k600.sh
new file mode 100644
index 0000000000..438297a620
--- /dev/null
+++ b/tools/data/kinetics/preprocess_k600.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+set -x
+
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+cat $DOWNLOAD_DIR/Kinetics600/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT)
+mv $(dirname $DATA_ROOT)/Kinetics600 $DATA_ROOT
diff --git a/tools/data/kinetics/preprocess_k700.sh b/tools/data/kinetics/preprocess_k700.sh
new file mode 100644
index 0000000000..930bf8577a
--- /dev/null
+++ b/tools/data/kinetics/preprocess_k700.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+set -x
+
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+cat $DOWNLOAD_DIR/Kinetics_700/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT)
+mv $(dirname $DATA_ROOT)/Kinetics_700 $DATA_ROOT
diff --git a/tools/data/kinetics710/README.md b/tools/data/kinetics710/README.md
new file mode 100644
index 0000000000..76a239fbf5
--- /dev/null
+++ b/tools/data/kinetics710/README.md
@@ -0,0 +1,91 @@
+# Preparing Kinetics-710
+
+## Introduction
+
+
+
+```BibTeX
+@misc{li2022uniformerv2,
+ title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer},
+ author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Yu Qiao},
+ year={2022},
+ eprint={2211.09552},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+For basic dataset information, please refer to the [paper](https://arxiv.org/pdf/2211.09552.pdf). The scripts can be used for preparing Kinetics-710. MMAction2 supports the Kinetics-710
+dataset as a concat dataset, which means we only provide a list of annotation files and reuse the original videos of the Kinetics-400/600/700 datasets. You can refer to the [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py)
+for details, which also serves as a template config for how to use a concat dataset in MMAction2.
+Before we start, please make sure that the directory is located at `$MMACTION2`.
+
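+As a rough illustration of what "concat dataset" means here, the sketch below declares one `VideoDataset` per Kinetics version, pointing at the Kinetics-710 annotation lists downloaded in Step 2, and wraps them with a `ConcatDataset`. Treat it as a simplified outline of the linked config rather than a runnable replacement for it: the pipeline, batch size and data roots are placeholders.
+
+```python
+# minimal sketch, not the full UniFormerV2 config: every Kinetics version keeps
+# its own annotation list and video root, and ConcatDataset merges them
+train_pipeline = [...]  # placeholder: any Kinetics recognition pipeline
+
+k400_trainset = dict(
+    type='VideoDataset',
+    ann_file='data/kinetics710/k400_train_list_videos.txt',
+    data_prefix=dict(video='data/kinetics400/videos_train'),
+    pipeline=train_pipeline)
+k600_trainset = dict(
+    type='VideoDataset',
+    ann_file='data/kinetics710/k600_train_list_videos.txt',
+    data_prefix=dict(video='data/kinetics600/videos'),
+    pipeline=train_pipeline)
+k700_trainset = dict(
+    type='VideoDataset',
+    ann_file='data/kinetics710/k700_train_list_videos.txt',
+    data_prefix=dict(video='data/kinetics700/videos'),
+    pipeline=train_pipeline)
+
+train_dataloader = dict(
+    batch_size=16,  # placeholder value
+    num_workers=8,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='ConcatDataset',
+        datasets=[k400_trainset, k600_trainset, k700_trainset]))
+```
+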
+## Step 1. Download Kinetics 400/600/700
+
+Kinetics-710 is a video benchmark based on Kinetics-400/600/700. It merges the training sets of these Kinetics datasets and removes duplicate videos according to their YouTube IDs. MMAction2 provides annotation files based on the Kinetics-400/600/700 data hosted on [OpenDataLab](https://opendatalab.com/), so we suggest you first download Kinetics-400/600/700 from OpenDataLab with [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+# install OpenDataLab CLI tools
+pip install -U opendatalab
+# log in OpenDataLab
+odl login
+# download Kinetics-400/600/700, note that this might take a long time.
+mim download mmaction2 --dataset kinetics400
+mim download mmaction2 --dataset kinetics600
+mim download mmaction2 --dataset kinetics700
+
+```
+
+## Step 2. Download Kinetics-710 Annotations
+
+We provide the annotation list of Kinetics-710 corresponding to the OpenDataLab version of Kinetics. You can download it from Aliyun and unzip it to `$MMACTION2/data/`.
+
+```shell
+wget -P data https://download.openmmlab.com/mmaction/dataset/kinetics710/annotations.zip
+cd data && unzip annotations.zip && cd ..
+
+```
+
+## Step 3. Folder Structure
+
+After the whole data pipeline for Kinetics preparation, you can get the videos and annotation files for Kinetics-710.
+
+In the context of the whole project (for Kinetics only), the *minimal* folder structure will look like:
+(*minimal* means that some data are not necessary: for example, you may want to evaluate kinetics using the original video format.)
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── kinetics400
+│ │ ├── videos_train
+│ │ ├── videos_val
+│ │ │ ├── jf7RDuUTrsQ.mp4
+│ │ │ ├── ...
+│ ├── kinetics600
+│ │ ├── videos
+│ │ │ ├── vol_00
+│ │ │ │ ├── -A5JFdMXB_k_000018_000028.mp4
+│ │ │ │ ├── ...
+│ │ │ ├── ...
+│ │ │ ├── vol63
+│ ├── kinetics700
+│ │ ├── videos
+│ │ │ ├── vol_00
+│ │ │ │ ├── -Paa0R0tQ1w_000009_000019.mp4
+│ │ │ │ ├── ...
+│ │ │ ├── ...
+│ │ │ ├── vol63
+│ ├── kinetics710
+│ │ ├── k400_train_list_videos.txt
+│ │ ├── k400_val_list_videos.txt
+│ │ ├── k600_train_list_videos.txt
+│ │ ├── k600_val_list_videos.txt
+│ │ ├── k700_train_list_videos.txt
+│ │ ├── k700_val_list_videos.txt
+```
+
+For training and evaluating on Kinetics, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/kinetics710/README_zh-CN.md b/tools/data/kinetics710/README_zh-CN.md
new file mode 100644
index 0000000000..b428be43df
--- /dev/null
+++ b/tools/data/kinetics710/README_zh-CN.md
@@ -0,0 +1,89 @@
+# 准备 Kinetics-710
+
+## 介绍
+
+
+
+```BibTeX
+@misc{li2022uniformerv2,
+ title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer},
+ author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Yu Qiao},
+ year={2022},
+ eprint={2211.09552},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+关于基本数据集信息,请参考 [论文](https://arxiv.org/pdf/2211.09552.pdf)。这些脚本可以用于准备 Kinetics-710。MMAction2 以 Concat Dataset 的形式支持了 Kinetics-710 数据集:我们只提供标注文件列表,并复用 Kinetics-400/600/700 数据集的原始数据。你可以参考 [配置](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py) 了解详情,它也提供了一个模板配置,说明了如何在 MMAction2 中使用 Concat Dataset。
+在我们开始之前,请确保目录位于 `$MMACTION2`。
+
+## 第一步:下载 Kinetics 400/600/700
+
+Kinetics-710 是基于 Kinetics-400/600/700 的视频数据集,它合并了这些 Kinetics 数据集的训练集,并根据 Youtube ID 删除了重复的视频。MMAction2 提供了一个基于 Kinetics-400/600/700 的 OpenDataLab 版本的标注文件,你可以通过 [MIM](https://github.com/open-mmlab/mim) 从 OpenDataLab 下载。
+
+```shell
+# 安装 OpenDataLab CLI 工具
+pip install -U opendatalab
+# 登录 OpenDataLab
+odl login
+# 下载 Kinetics-400/600/700,注意这可能需要很长时间。
+mim download mmaction2 --dataset kinetics400
+mim download mmaction2 --dataset kinetics600
+mim download mmaction2 --dataset kinetics700
+
+```
+
+## 第二步:下载 Kinetics-710 标注文件
+
+我们提供了与 OpenDataLab 版本 Kinetics 相对应的 Kinetics-710 标注文件列表,你可以从阿里云下载它,并将其解压到 `$MMACTION2/data/`
+
+```shell
+wget -P data https://download.openmmlab.com/mmaction/dataset/kinetics710/annotations.zip
+cd data && unzip annotations.zip && cd ..
+
+```
+
+## 第三步:文件夹结构
+
+完成 Kinetics 准备的整个数据流程后,你可以得到 Kinetics-710 的视频和标注文件。
+
+在整个项目目录下(仅针对 Kinetics),*最小*的文件夹结构如下:
+(*最小*意味着一些数据是不必要的:例如,你可能想要使用原始视频格式评估 kinetics。)
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── kinetics400
+│ │ ├── videos_train
+│ │ ├── videos_val
+│ │ │ ├── jf7RDuUTrsQ.mp4
+│ │ │ ├── ...
+│ ├── kinetics600
+│ │ ├── videos
+│ │ │ ├── vol_00
+│ │ │ │ ├── -A5JFdMXB_k_000018_000028.mp4
+│ │ │ │ ├── ...
+│ │ │ ├── ...
+│ │ │ ├── vol63
+│ ├── kinetics700
+│ │ ├── videos
+│ │ │ ├── vol_00
+│ │ │ │ ├── -Paa0R0tQ1w_000009_000019.mp4
+│ │ │ │ ├── ...
+│ │ │ ├── ...
+│ │ │ ├── vol63
+│ ├── kinetics710
+│ │ ├── k400_train_list_videos.txt
+│ │ ├── k400_val_list_videos.txt
+│ │ ├── k600_train_list_videos.txt
+│ │ ├── k600_val_list_videos.txt
+│ │ ├── k700_train_list_videos.txt
+│ │ ├── k700_val_list_videos.txt
+```
+
+关于在 Kinetics 上进行训练和评估,请参考 [训练和测试教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/kinetics710/label_map_k710.txt b/tools/data/kinetics710/label_map_k710.txt
new file mode 100644
index 0000000000..5b3f53cedd
--- /dev/null
+++ b/tools/data/kinetics710/label_map_k710.txt
@@ -0,0 +1,710 @@
+abseiling
+air drumming
+answering questions
+applauding
+applying cream
+archery
+arm wrestling
+arranging flowers
+assembling computer
+auctioning
+baby waking up
+baking cookies
+inflating balloons
+bandaging
+barbequing
+bartending
+beatboxing
+bee keeping
+belly dancing
+bench pressing
+bending back
+bending metal
+biking through snow
+blasting sand
+blowing glass
+blowing leaves
+blowing nose
+blowing out candles
+bobsledding
+bookbinding
+bouncing on trampoline
+bowling
+braiding hair
+breading or breadcrumbing
+breakdancing
+brush painting
+brushing hair
+brushing teeth
+building cabinet
+building shed
+bungee jumping
+busking
+canoeing or kayaking
+capoeira
+carrying baby
+cartwheeling
+carving pumpkin
+catching fish
+catching or throwing baseball
+catching or throwing frisbee
+catching or throwing softball
+celebrating
+changing oil
+changing wheel (not on bike)
+checking tires
+cheerleading
+chopping wood
+clapping
+clay pottery making
+clean and jerk
+cleaning floor
+cleaning gutters
+cleaning pool
+cleaning shoes
+cleaning toilet
+cleaning windows
+climbing a rope
+climbing ladder
+climbing tree
+contact juggling
+cooking chicken
+cooking egg
+cooking on campfire
+cooking sausages (not on barbeque)
+counting money
+country line dancing
+cracking neck
+crawling baby
+crossing river
+crying
+curling hair
+cutting nails
+cutting pineapple
+cutting watermelon
+dancing ballet
+dancing charleston
+dancing gangnam style
+dancing macarena
+deadlifting
+decorating the christmas tree
+digging
+dining
+disc golfing
+diving cliff
+dodgeball
+doing aerobics
+doing laundry
+doing nails
+drawing
+dribbling basketball
+sipping cup
+drinking beer
+drinking shots
+driving car
+driving tractor
+drop kicking
+drumming fingers
+dunking basketball
+dyeing hair
+eating burger
+eating cake
+eating carrots
+eating chips
+eating doughnuts
+eating hotdog
+eating ice cream
+eating spaghetti
+eating watermelon
+egg hunting
+exercising arm
+exercising with an exercise ball
+extinguishing fire
+faceplanting
+feeding birds
+feeding fish
+feeding goats
+filling eyebrows
+finger snapping
+fixing hair
+flipping pancake
+flying kite
+folding clothes
+folding napkins
+folding paper
+front raises
+frying vegetables
+person collecting garbage
+gargling
+getting a haircut
+getting a tattoo
+giving or receiving award
+golf chipping
+golf driving
+golf putting
+grinding meat
+grooming dog
+grooming horse
+gymnastics tumbling
+hammer throw
+headbanging
+headbutting
+high jump
+high kick
+hitting baseball
+hockey stop
+holding snake
+hopscotch
+hoverboarding
+hugging (not baby)
+hula hooping
+hurdling
+hurling (sport)
+ice climbing
+ice fishing
+ice skating
+ironing
+javelin throw
+jetskiing
+jogging
+juggling balls
+juggling fire
+juggling soccer ball
+jumping into pool
+jumpstyle dancing
+kicking field goal
+kicking soccer ball
+kissing
+kitesurfing
+knitting
+krumping
+laughing
+laying bricks
+long jump
+lunge
+making a cake
+making a sandwich
+making the bed
+making jewelry
+making pizza
+making snowman
+making sushi
+making tea
+marching
+massaging back
+massaging feet
+massaging legs
+massaging person's head
+milking cow
+mopping floor
+motorcycling
+moving furniture
+mowing lawn
+news anchoring
+opening bottle (not wine)
+opening present
+paragliding
+parasailing
+parkour
+passing American football (in game)
+passing American football (not in game)
+peeling apples
+peeling potatoes
+petting animal (not cat)
+petting cat
+picking apples
+planting trees
+plastering
+playing accordion
+playing badminton
+playing bagpipes
+playing basketball
+playing bass guitar
+playing cards
+playing cello
+playing chess
+playing clarinet
+playing controller
+playing cricket
+playing cymbals
+playing didgeridoo
+playing drums
+playing flute
+playing guitar
+playing harmonica
+playing harp
+playing ice hockey
+playing keyboard
+playing kickball
+playing monopoly
+playing organ
+playing paintball
+playing piano
+playing poker
+playing recorder
+playing saxophone
+playing squash or racquetball
+playing tennis
+playing trombone
+playing trumpet
+playing ukulele
+playing violin
+playing volleyball
+playing xylophone
+pole vault
+presenting weather forecast
+pull ups
+pumping fist
+pumping gas
+punching bag
+punching person (boxing)
+push up
+pushing car
+pushing cart
+pushing wheelchair
+reading book
+reading newspaper
+recording music
+riding a bike
+riding camel
+riding elephant
+riding mechanical bull
+riding mountain bike
+riding mule
+riding or walking with horse
+riding scooter
+riding unicycle
+ripping paper
+robot dancing
+rock climbing
+rock scissors paper
+roller skating
+running on treadmill
+sailing
+salsa dancing
+sanding floor
+scrambling eggs
+scuba diving
+setting table
+shaking hands
+shaking head
+sharpening knives
+sharpening pencil
+shaving head
+shaving legs
+shearing sheep
+shining shoes
+shooting basketball
+shooting goal (soccer)
+shot put
+shoveling snow
+shredding paper
+shuffling cards
+side kick
+sign language interpreting
+singing
+situp
+skateboarding
+ski jumping
+skiing mono
+skiing crosscountry
+skiing slalom
+skipping rope
+skydiving
+slacklining
+slapping
+sled dog racing
+smoking
+smoking hookah
+snatch weight lifting
+sneezing
+smelling feet
+snorkeling
+snowboarding
+snowkiting
+snowmobiling
+somersaulting
+spinning poi
+spray painting
+spraying
+springboard diving
+squat
+sticking tongue out
+stomping grapes
+stretching arm
+stretching leg
+strumming guitar
+surfing crowd
+surfing water
+sweeping floor
+swimming backstroke
+swimming breast stroke
+swimming butterfly stroke
+swing dancing
+swinging legs
+swinging on something
+sword fighting
+tai chi
+taking a shower
+tango dancing
+tap dancing
+tapping guitar
+tapping pen
+tasting beer
+tasting food
+testifying
+texting
+throwing axe
+throwing ball
+throwing discus
+tickling
+tobogganing
+tossing coin
+tossing salad
+training dog
+trapezing
+trimming or shaving beard
+trimming trees
+triple jump
+tying bow tie
+tying knot (not on a tie)
+tying necktie
+unboxing
+unloading truck
+using computer
+using remote controller (not gaming)
+using segway
+vault
+waiting in line
+walking the dog
+washing dishes
+washing feet
+washing hair
+washing hands
+water skiing
+water sliding
+watering plants
+waxing back
+waxing chest
+waxing eyebrows
+waxing legs
+weaving basket
+welding
+whistling
+windsurfing
+wrapping present
+wrestling
+writing
+yawning
+yoga
+zumba
+poaching eggs
+playing nose flute
+entering church
+closing door
+helmet diving
+doing sudoku
+coughing
+seasoning food
+peeling banana
+eating nachos
+waxing armpits
+shouting
+silent disco
+polishing furniture
+taking photo
+dealing cards
+putting wallpaper on wall
+uncorking champagne
+curling eyelashes
+brushing floor
+pulling espresso shot
+playing american football
+grooming cat
+playing checkers
+moving child
+stacking cups
+squeezing orange
+opening coconuts
+rolling eyes
+picking blueberries
+playing road hockey
+carving wood with a knife
+slicing onion
+saluting
+letting go of balloon
+breaking glass
+carrying weight
+mixing colours
+moving baby
+blending fruit
+pouring milk
+surveying
+making slime
+sieving
+walking with crutches
+flipping bottle
+playing billiards
+arresting
+listening with headphones
+spinning plates
+carving marble
+cutting cake
+shoot dance
+being excited
+petting horse
+splashing water
+filling cake
+stacking dice
+checking watch
+treating wood
+laying decking
+shooting off fireworks
+pouring wine
+pretending to be a statue
+steering car
+playing rounders
+looking in mirror
+jumping sofa
+lighting candle
+walking on stilts
+crocheting
+playing piccolo
+vacuuming car
+high fiving
+playing shuffleboard
+chasing
+pulling rope (game)
+being in zero gravity
+sanding wood
+decoupage
+using megaphone
+making latte art
+ski ballet
+playing oboe
+bouncing ball (not juggling)
+playing mahjong
+herding cattle
+swimming with sharks
+milking goat
+swimming with dolphins
+metal detecting
+playing slot machine
+polishing metal
+throwing tantrum
+lawn mower racing
+laying stone
+cutting orange
+skipping stone
+pouring beer
+making bubbles
+jaywalking
+leatherworking
+card stacking
+putting on eyeliner
+card throwing
+chewing gum
+falling off bike
+repairing puncture
+dumpster diving
+tiptoeing
+sleeping
+using circular saw
+cracking knuckles
+pinching
+chiseling wood
+playing rubiks cube
+weaving fabric
+fencing (sport)
+sword swallowing
+lighting fire
+vacuuming floor
+combing hair
+building lego
+playing pinball
+fly tying
+playing lute
+opening door
+waving hand
+rolling pastry
+chiseling stone
+threading needle
+playing dominoes
+opening wine bottle
+playing with trains
+steer roping
+playing field hockey
+separating eggs
+sewing
+talking on cell phone
+needle felting
+pushing wheelbarrow
+using a paint roller
+playing netball
+lifting hat
+massaging neck
+blowing bubble gum
+walking through snow
+docking boat
+clam digging
+marriage proposal
+packing
+sausage making
+licking
+scrapbooking
+flint knapping
+lock picking
+putting on lipstick
+sawing wood
+playing hand clapping games
+geocaching
+looking at phone
+making cheese
+poking bellybutton
+contorting
+fixing bicycle
+using a microscope
+using a wrench
+doing jigsaw puzzle
+making horseshoes
+cooking scallops
+square dancing
+getting a piercing
+playing ocarina
+making paper aeroplanes
+playing scrabble
+visiting the zoo
+crossing eyes
+jumping bicycle
+throwing water balloon
+bodysurfing
+pirouetting
+luge
+spelunking
+watching tv
+attending conference
+curling (sport)
+directing traffic
+swimming front crawl
+ice swimming
+battle rope training
+putting on mascara
+bouncing on bouncy castle
+smoking pipe
+pillow fight
+putting on sari
+calligraphy
+roasting pig
+cracking back
+shopping
+burping
+using bagging machine
+staring
+shucking oysters
+blowdrying hair
+smashing
+playing laser tag
+wading through mud
+rope pushdown
+preparing salad
+making balloon shapes
+tagging graffiti
+adjusting glasses
+using a power drill
+trimming shrubs
+popping balloons
+playing pan pipes
+using puppets
+arguing
+backflip (human)
+riding snow blower
+hand washing clothes
+calculating
+gospel singing in church
+standing on hands
+tasting wine
+shaping bread dough
+wading through water
+falling off chair
+throwing snowballs
+building sandcastle
+land sailing
+tying shoe laces
+jumping jacks
+wood burning (art)
+putting on foundation
+putting on shoes
+cumbia
+archaeological excavation
+mountain climber (exercise)
+assembling bicycle
+head stand
+cutting apple
+shuffling feet
+bottling
+breathing fire
+using inhaler
+historical reenactment
+hugging baby
+mushroom foraging
+delivering mail
+laying tiles
+using atm
+chopping meat
+tightrope walking
+mosh pit dancing
+photobombing
+coloring in
+huddling
+playing gong
+laying concrete
+breaking boards
+acting in play
+base jumping
+tie dying
+using a sledge hammer
+playing ping pong
+photocopying
+winking
+waking up
+swinging baseball bat
+twiddling fingers
+playing polo
+longboarding
+ironing hair
+bathing dog
+moon walking
+playing marbles
+embroidering
+playing beer pong
+home roasting coffee
+gold panning
+karaoke
+changing gear in car
+raising eyebrows
+yarn spinning
+scrubbing face
+fidgeting
+planing wood
+cosplaying
+capsizing
+tackling
+shining flashlight
+dyeing eyebrows
+drooling
+alligator wrestling
+playing blackjack
+carving ice
+playing maracas
+opening refrigerator
+throwing knife
+putting in contact lenses
+passing soccer ball
+casting fishing line
+sucking lolly
+installing carpet
+bulldozing
+roasting marshmallows
+playing darts
+chopping vegetables
+bull fighting
diff --git a/tools/data/mit/README.md b/tools/data/mit/README.md
index e67ca45335..69828024d9 100644
--- a/tools/data/mit/README.md
+++ b/tools/data/mit/README.md
@@ -34,7 +34,7 @@ python ../resize_videos.py ../../../data/mit/videos/ ../../../data/mit/videos_25
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. And you can run the following script to soft link the extracted frames.
@@ -125,4 +125,4 @@ mmaction2
```
-For training and evaluating on Moments in Time, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on Moments in Time, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/mit/README_zh-CN.md b/tools/data/mit/README_zh-CN.md
index 74a3d0c247..1eaa6eafb2 100644
--- a/tools/data/mit/README_zh-CN.md
+++ b/tools/data/mit/README_zh-CN.md
@@ -36,7 +36,7 @@ python ../resize_videos.py ../../../data/mit/videos/ ../../../data/mit/videos_25
如果用户只想使用视频加载训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。
用户可使用以下命令为 SSD 建立软链接。
@@ -127,4 +127,4 @@ mmaction2
```
-关于对 Moments in Times 进行训练和验证,可以参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 Moments in Times 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/mmit/README.md b/tools/data/mmit/README.md
index 5deedf71d0..cadd8ab705 100644
--- a/tools/data/mmit/README.md
+++ b/tools/data/mmit/README.md
@@ -32,7 +32,7 @@ python ../resize_videos.py ../../../data/mmit/videos/ ../../../data/mmit/videos_
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
First, you can run the following script to soft link SSD.
@@ -110,4 +110,4 @@ mmaction2/
└── ...
```
-For training and evaluating on Multi-Moments in Time, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on Multi-Moments in Time, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/mmit/README_zh-CN.md b/tools/data/mmit/README_zh-CN.md
index e070505e34..231a5aac68 100644
--- a/tools/data/mmit/README_zh-CN.md
+++ b/tools/data/mmit/README_zh-CN.md
@@ -34,7 +34,7 @@ python ../resize_videos.py ../../../data/mmit/videos/ ../../../data/mmit/videos_
如果用户只想使用视频加载训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。
用户可使用以下命令为 SSD 建立软链接。
@@ -112,4 +112,4 @@ mmaction2/
└── ...
```
-关于对 Multi-Moments in Time 进行训练和验证,可以参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 Multi-Moments in Time 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/multisports/README.md b/tools/data/multisports/README.md
new file mode 100644
index 0000000000..98fe25698f
--- /dev/null
+++ b/tools/data/multisports/README.md
@@ -0,0 +1,111 @@
+# Preparing Multisports
+
+## Introduction
+
+
+
+```BibTeX
+@inproceedings{li2021multisports,
+ title={Multisports: A multi-person video dataset of spatio-temporally localized sports actions},
+ author={Li, Yixuan and Chen, Lei and He, Runyu and Wang, Zhenzhi and Wu, Gangshan and Wang, Limin},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={13536--13545},
+ year={2021}
+}
+```
+
+For basic dataset information, please refer to the official [project](https://deeperaction.github.io/datasets/multisports.html) and the [paper](https://arxiv.org/abs/2105.07404).
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/multisports/`.
+
+## Step 1. Prepare Annotations
+
+First of all, you have to download the annotations and videos from the official [website](https://github.com/MCG-NJU/MultiSports) to `$MMACTION2/data/multisports`. Please also download the Person Boxes and put them in `$MMACTION2/data/multisports`.
+
+## Step 2. Prepare Videos
+
+Before this step, please make sure the folder structure looks like:
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── multisports
+│ | ├── MultiSports_box.zip
+│ | ├── trainval
+│ | | ├── aerobic_gymnastics.zip
+│ | | ├── basketball.zip
+│ | | ├── multisports_GT.pkl
+│ | | ├──...
+│ | ├── test
+│ | | ├── aerobic_gymnastics.zip
+│ | | ├── basketball.zip
+│ | | ├──...
+```
+
+Then, you can use the following commands to uncompress the archives.
+
+```shell
+cd $MMACTION2/data/multisports/
+unzip MultiSports_box.zip
+cd $MMACTION2/data/multisports/trainval
+find . -name '*.zip' -exec unzip {} \;
+cd $MMACTION2/data/multisports/test
+find . -name '*.zip' -exec unzip {} \;
+cd $MMACTION2/tools/data/multisports/
+```
+
+## Step 3. Convert Annotations
+
+You can run the following script to convert the annotations and proposals into the required format.
+
+```shell
+cd $MMACTION2/tools/data/multisports/
+python parse_anno.py
+```
+
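+After testing a spatio-temporal detection model, you can convert the dumped test results into the MultiSports competition format with `format_det_result.py` (a usage sketch; `${RESULT_PKL}` is a placeholder for your own dumped result file):
+
+```shell
+cd $MMACTION2/tools/data/multisports/
+python format_det_result.py ${RESULT_PKL} --anno-path ../../../data/multisports/trainval/multisports_GT.pkl
+```
+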
+## Step 4. Check Directory Structure
+
+After the whole data process, you will get the videos and annotation files for MultiSports.
+
+In the context of the whole project (for MultiSports only), the folder structure will look like:
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── multisports
+│ | ├── annotations
+| │ | ├── multisports_dense_proposals_test.recall_96.13.pkl
+| │ | ├── multisports_dense_proposals_train.recall_96.13.pkl
+| │ | ├── multisports_dense_proposals_val.recall_96.13.pkl
+| │ | ├── multisports_GT.pkl
+| │ | ├── multisports_train.csv
+| │ | ├── multisports_val.csv
+│ | ├── trainval
+│ | | ├── aerobic_gymnastics
+| │ | | ├── v__wAgwttPYaQ_c001.mp4
+| │ | | ├── v__wAgwttPYaQ_c002.mp4
+| │ | | ├── ...
+│ | | ├── basketball
+| │ | | ├── v_-6Os86HzwCs_c001.mp4
+| │ | | ├── v_-6Os86HzwCs_c002.mp4
+| │ | | ├── ...
+│ | | ├── multisports_GT.pkl
+│ | | ├──...
+│ | ├── test
+│ | | ├── aerobic_gymnastics
+| │ | | ├── v_2KroSzspz-c_c001.mp4
+| │ | | ├── v_2KroSzspz-c_c002.mp4
+| │ | | ├── ...
+│ | | ├── basketball
+| │ | | ├── v_1tefH1iPbGM_c001.mp4
+| │ | | ├── v_1tefH1iPbGM_c002.mp4
+│ | | ├──...
+```
+
+The zip files are no longer needed after extraction, so you can handle them as you wish.
+For training and evaluating on MultiSports, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/multisports/README_zh-CN.md b/tools/data/multisports/README_zh-CN.md
new file mode 100644
index 0000000000..3e0470f12e
--- /dev/null
+++ b/tools/data/multisports/README_zh-CN.md
@@ -0,0 +1,111 @@
+# 准备 MultiSports
+
+## 介绍
+
+
+
+```BibTeX
+@inproceedings{li2021multisports,
+ title={Multisports: A multi-person video dataset of spatio-temporally localized sports actions},
+ author={Li, Yixuan and Chen, Lei and He, Runyu and Wang, Zhenzhi and Wu, Gangshan and Wang, Limin},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={13536--13545},
+ year={2021}
+}
+```
+
+关于基本数据集信息,请参考官方 [项目](https://deeperaction.github.io/datasets/multisports.html) 和 [论文](https://arxiv.org/abs/2105.07404)。
+在我们开始之前,请确保目录位于 `$MMACTION2/tools/data/multisports/`。
+
+## 第一步:准备标注
+
+首先,你必须从官方 [网站](https://github.com/MCG-NJU/MultiSports) 下载标注和视频到 `$MMACTION2/data/multisports`,请同时下载人物检测框并将其放到 `$MMACTION2/data/multisports`。
+
+## 第二步:准备视频
+
+在这一步之前,请确保文件夹结构如下:
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── multisports
+│ | ├── MultiSports_box.zip
+│ | ├── trainval
+│ | | ├── aerobic_gymnastics.zip
+│ | | ├── basketball.zip
+│ | | ├── multisports_GT.pkl
+│ | | ├──...
+│ | ├── test
+│ | | ├── aerobic_gymnastics.zip
+│ | | ├── basketball.zip
+│ | | ├──...
+```
+
+然后,你可以使用以下命令进行解压。
+
+```shell
+cd $MMACTION2/data/multisports/
+unzip MultiSports_box.zip
+cd $MMACTION2/data/multisports/trainval
+find . -name '*.zip' -exec unzip {} \;
+cd $MMACTION2/data/multisports/test
+find . -name '*.zip' -exec unzip {} \;
+cd $MMACTION2/tools/data/multisports/
+```
+
+## 第三步:转换标注文件
+
+你可以运行以下脚本来转换我们需要的标注文件和候选框。
+
+```shell
+cd $MMACTION2/tools/data/multisports/
+python parse_anno.py
+```
+
+## 第四步:检查目录结构
+
+完成整个数据处理后,你将得到 MultiSports 数据集的视频和标注文件。
+
+在整个项目的目录中(仅针对 MultiSports),文件夹结构如下:
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── multisports
+│ | ├── annotations
+| │ | ├── multisports_dense_proposals_test.recall_96.13.pkl
+| │ | ├── multisports_dense_proposals_train.recall_96.13.pkl
+| │ | ├── multisports_dense_proposals_val.recall_96.13.pkl
+| │ | ├── multisports_GT.pkl
+| │ | ├── multisports_train.csv
+| │ | ├── multisports_val.csv
+│ | ├── trainval
+│ | | ├── aerobic_gymnastics
+| │ | | ├── v__wAgwttPYaQ_c001.mp4
+| │ | | ├── v__wAgwttPYaQ_c002.mp4
+| │ | | ├── ...
+│ | | ├── basketball
+| │ | | ├── v_-6Os86HzwCs_c001.mp4
+| │ | | ├── v_-6Os86HzwCs_c002.mp4
+| │ | | ├── ...
+│ | | ├── multisports_GT.pkl
+│ | | ├──...
+│ | ├── test
+│ | | ├── aerobic_gymnastics
+| │ | | ├── v_2KroSzspz-c_c001.mp4
+| │ | | ├── v_2KroSzspz-c_c002.mp4
+| │ | | ├── ...
+│ | | ├── basketball
+| │ | | ├── v_1tefH1iPbGM_c001.mp4
+| │ | | ├── v_1tefH1iPbGM_c002.mp4
+│ | | ├──...
+```
+
+我们不需要项目下的 zip 文件,你可以按照自己的意愿处理它们。
+关于在 MultiSports 上进行训练和评估,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。
diff --git a/tools/data/multisports/format_det_result.py b/tools/data/multisports/format_det_result.py
new file mode 100644
index 0000000000..84fd78811e
--- /dev/null
+++ b/tools/data/multisports/format_det_result.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser
+
+import numpy as np
+from mmengine import dump, load
+from rich.progress import track
+
+from mmaction.evaluation import link_tubes
+
+
+def parse_args():
+ parser = ArgumentParser()
+    parser.add_argument('test_result', help='path of the dumped test results')
+ parser.add_argument(
+ '--anno-path',
+ default='data/multisports/videos/trainval/multisports_GT.pkl')
+ parser.add_argument(
+ '--frm_out_path',
+ default=None,
+ help='frame-level detection results output path')
+ parser.add_argument(
+ '--tube_out_path',
+ default=None,
+ help='tube-level detection results output path')
+ args = parser.parse_args()
+ if not args.frm_out_path:
+        args.frm_out_path = args.test_result[:-4] + '-formatted.pkl'
+ if not args.tube_out_path:
+ args.tube_out_path = args.test_result[:-4] + '_vid_dets.pkl'
+ return args
+
+
+def format_det_result():
+ """convert test results to specified format in MultiSports competition."""
+ test_results = load(args.test_result)
+ annos = load(args.anno_path)
+ test_videos = annos['test_videos'][0]
+ resolutions = annos['resolution']
+ frm_dets = []
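+    # Each frame-level detection is stored as a row of
+    # [video_idx, frame_stamp, label, score, x1, y1, x2, y2],
+    # with box coordinates rescaled to absolute pixel values.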
+    for pred in track(test_results, description='formatting...'):
+ video_key = pred['video_id'].split('.mp4')[0]
+ frm_num = pred['timestamp']
+ bboxes = pred['pred_instances']['bboxes']
+ cls_scores = pred['pred_instances']['scores']
+ for bbox, cls_score in zip(bboxes, cls_scores):
+ video_idx = test_videos.index(video_key)
+ pred_label = np.argmax(cls_score)
+ score = cls_score[pred_label]
+ h, w = resolutions[video_key]
+ bbox *= np.array([w, h, w, h])
+ instance_result = np.array(
+ [video_idx, frm_num, pred_label, score, *bbox])
+ frm_dets.append(instance_result)
+ frm_dets = np.array(frm_dets)
+ video_tubes = link_tubes(annos, frm_dets, K=1)
+ dump(frm_dets, args.frm_out_path)
+ dump(video_tubes, args.tube_out_path)
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ format_det_result()
diff --git a/tools/data/multisports/label_map.txt b/tools/data/multisports/label_map.txt
new file mode 100644
index 0000000000..b710721362
--- /dev/null
+++ b/tools/data/multisports/label_map.txt
@@ -0,0 +1,66 @@
+0: aerobic_push_up
+1: aerobic_explosive_push_up
+2: aerobic_explosive_support
+3: aerobic_leg_circle
+4: aerobic_helicopter
+5: aerobic_support
+6: aerobic_v_support
+7: aerobic_horizontal_support
+8: aerobic_straight_jump
+9: aerobic_illusion
+10: aerobic_bent_leg(s)_jump
+11: aerobic_pike_jump
+12: aerobic_straddle_jump
+13: aerobic_split_jump
+14: aerobic_scissors_leap
+15: aerobic_kick_jump
+16: aerobic_off_axis_jump
+17: aerobic_butterfly_jump
+18: aerobic_split
+19: aerobic_turn
+20: aerobic_balance_turn
+21: volleyball_serve
+22: volleyball_block
+23: volleyball_first_pass
+24: volleyball_defend
+25: volleyball_protect
+26: volleyball_second_pass
+27: volleyball_adjust
+28: volleyball_save
+29: volleyball_second_attack
+30: volleyball_spike
+31: volleyball_dink
+32: volleyball_no_offensive_attack
+33: football_shoot
+34: football_long_pass
+35: football_short_pass
+36: football_through_pass
+37: football_cross
+38: football_dribble
+39: football_trap
+40: football_throw
+41: football_diving
+42: football_tackle
+43: football_steal
+44: football_clearance
+45: football_block
+46: football_press
+47: football_aerial_duels
+48: basketball_pass
+49: basketball_drive
+50: basketball_dribble
+51: basketball_3-point_shot
+52: basketball_2-point_shot
+53: basketball_free_throw
+54: basketball_block
+55: basketball_offensive_rebound
+56: basketball_defensive_rebound
+57: basketball_pass_steal
+58: basketball_dribble_steal
+59: basketball_interfere_shot
+60: basketball_pick-and-roll_defensive
+61: basketball_sag
+62: basketball_screen
+63: basketball_pass-inbound
+64: basketball_save
+65: basketball_jump_ball
diff --git a/tools/data/multisports/parse_anno.py b/tools/data/multisports/parse_anno.py
new file mode 100644
index 0000000000..bb8ed66316
--- /dev/null
+++ b/tools/data/multisports/parse_anno.py
@@ -0,0 +1,96 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import csv
+import os
+import os.path as osp
+from argparse import ArgumentParser
+
+import numpy as np
+from mmengine import dump, list_dir_or_file, load
+
+
+def parse_args():
+ parser = ArgumentParser()
+ parser.add_argument(
+ '--data-root',
+ default='data/multisports',
+ help='the directory to multisports annotations')
+ parser.add_argument(
+ '--out-root',
+ default='data/multisports',
+ help='output directory of output annotation files')
+ parser.add_argument('--dump-proposals', action='store_true')
+ args = parser.parse_args()
+ return args
+
+
+def parse_anno(args):
+    out_anno_root = osp.join(args.out_root, 'annotations')
+    if not osp.exists(out_anno_root):
+        os.makedirs(out_anno_root)
+
+ anno_path = osp.join(args.data_root, 'annotations/multisports_GT.pkl')
+ annos = load(anno_path)
+
+ # convert key in proposal file to filename
+ key2filename = {
+ video.split('/')[1]: video + '.mp4'
+ for video in annos['nframes'].keys()
+ }
+ test_videos = [
+ file for file in list_dir_or_file(
+ osp.join(args.data_root, 'test'), recursive=True)
+ if file.endswith('.mp4')
+ ]
+ key2filename.update(
+ {video.split('/')[1][:-4]: video
+ for video in test_videos})
+ # convert proposal bboxes
+ if args.dump_proposals:
+ proposals_path = osp.join(args.data_root,
+ 'annotations/MultiSports_box')
+ for proposals in os.listdir(proposals_path):
+ proposal_info = load(osp.join(proposals_path, proposals))
+ proposal_out = dict()
+ for key in proposal_info.keys():
+ key_split = key.split(',')
+ if key_split[0] in key2filename.keys():
+ new_key = \
+ f'{key2filename[key_split[0]]},{int(key_split[1]):04d}'
+ proposal_out[new_key] = proposal_info[key]
+ target_path = osp.join(args.out_root, 'annotations',
+ 'multisports_dense_proposals_' + proposals)
+ dump(proposal_out, target_path)
+ # dump train and val list
+ for split in ['train', 'val']:
+ out_anno_path = osp.join(args.out_root, 'annotations',
+ f'multisports_{split}.csv')
+ with open(out_anno_path, 'w') as csv_f:
+ writer = csv.writer(csv_f)
+ if split == 'train':
+ video_list = annos['train_videos'][0]
+ elif split == 'val':
+ video_list = annos['test_videos'][0]
+ gt_tubes = annos['gttubes']
+ resolutions = annos['resolution']
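+            # Each CSV row is written as
+            # [filename, frame_stamp, x1, y1, x2, y2, label, entity_id],
+            # with box coordinates normalized by the video resolution.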
+ for video_id in video_list:
+ vid_tubes = gt_tubes[video_id]
+ h, w = resolutions[video_id]
+ for label, tubes in vid_tubes.items():
+ entity_id = 0
+ for tube in tubes:
+ for frame_anno in tube:
+ frame_stamp = int(frame_anno[0])
+ entity_box = frame_anno[1:]
+ entity_box /= np.array([w, h, w, h])
+ entity_box = [f'{num:.3f}' for num in entity_box]
+ filename = video_id + '.mp4'
+ anno_line = [
+ filename, frame_stamp, *entity_box, label,
+ entity_id
+ ]
+ writer.writerow(anno_line)
+ entity_id += 1
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ parse_anno(args)
diff --git a/tools/data/omnisource/trim_raw_video.py b/tools/data/omnisource/trim_raw_video.py
index 81aef77140..6f31103755 100644
--- a/tools/data/omnisource/trim_raw_video.py
+++ b/tools/data/omnisource/trim_raw_video.py
@@ -4,7 +4,7 @@
import sys
from subprocess import check_output
-import mmcv
+import mmengine
def get_duration(vid_name):
@@ -28,7 +28,7 @@ def trim(vid_name):
# We output 10-second clips into the folder `name`
dest = name
- mmcv.mkdir_or_exist(dest)
+ mmengine.mkdir_or_exist(dest)
command_tmpl = ('ffmpeg -y loglevel error -i {} -ss {} -t {} -crf 18 '
'-c:v libx264 {}/part_{}.mp4')
diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md
index 10244d23a1..2f55a2021e 100644
--- a/tools/data/skeleton/README.md
+++ b/tools/data/skeleton/README.md
@@ -58,7 +58,7 @@ Each pickle file corresponds to an action recognition dataset. The content of a
## Visualization
-For skeleton data visualization, you need also to prepare the RGB videos. Please refer to [visualize_heatmap_volume](/demo/visualize_heatmap_volume.ipynb) for detailed process. Here we provide some visualization examples from NTU-60 and FineGYM.
+For skeleton data visualization, you also need to prepare the RGB videos. Please refer to \[visualize_heatmap_volume\] for the detailed process. Here we provide some visualization examples from NTU-60 and FineGYM.
diff --git a/tools/data/skeleton/README_zh-CN.md b/tools/data/skeleton/README_zh-CN.md
index 3754175908..2cd354a1d5 100644
--- a/tools/data/skeleton/README_zh-CN.md
+++ b/tools/data/skeleton/README_zh-CN.md
@@ -72,7 +72,7 @@ python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.
## 可视化
-为了可视化骨架数据,用户需要准备 RGB 的视频。详情可参考 [visualize_heatmap_volume](/demo/visualize_heatmap_volume.ipynb)。这里提供一些 NTU-60 和 FineGYM 上的例子
+为了可视化骨架数据,用户需要准备 RGB 的视频。详情可参考 \[visualize_heatmap_volume\]。这里提供一些 NTU-60 和 FineGYM 上的例子
@@ -139,4 +139,4 @@ MMAction2 提供脚本以将其他第三方项目的骨骼标注转至 MMAction2
- [x] NTU120_XSet
- [x] UCF101
- [x] HMDB51
-- [ ] Kinetics
+- [x] Kinetics
diff --git a/tools/data/skeleton/ntu_pose_extraction.py b/tools/data/skeleton/ntu_pose_extraction.py
index d60fefdd97..d03d2a90a3 100644
--- a/tools/data/skeleton/ntu_pose_extraction.py
+++ b/tools/data/skeleton/ntu_pose_extraction.py
@@ -229,6 +229,31 @@ def ntu_det_postproc(vid, det_results):
return bboxes2bbox(det_results, len(det_results))
+def pose_inference_with_align(args, frame_paths, det_results):
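+    # Run pose estimation on the extracted frames and pad the per-frame
+    # results so that every frame holds the same (maximum) number of
+    # persons; missing persons are zero-filled. Returns keypoints with
+    # shape (num_person, num_frame, num_keypoint, 2) and scores with
+    # shape (num_person, num_frame, num_keypoint).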
+ # filter frame without det bbox
+ det_results = [
+ frm_dets for frm_dets in det_results if frm_dets.shape[0] > 0
+ ]
+
+ pose_results, _ = pose_inference(args.pose_config, args.pose_checkpoint,
+ frame_paths, det_results, args.device)
+ # align the num_person among frames
+ num_persons = max([pose['keypoints'].shape[0] for pose in pose_results])
+ num_points = pose_results[0]['keypoints'].shape[1]
+ num_frames = len(pose_results)
+ keypoints = np.zeros((num_persons, num_frames, num_points, 2),
+ dtype=np.float32)
+ scores = np.zeros((num_persons, num_frames, num_points), dtype=np.float32)
+
+ for f_idx, frm_pose in enumerate(pose_results):
+ frm_num_persons = frm_pose['keypoints'].shape[0]
+ for p_idx in range(frm_num_persons):
+ keypoints[p_idx, f_idx] = frm_pose['keypoints'][p_idx]
+ scores[p_idx, f_idx] = frm_pose['keypoint_scores'][p_idx]
+
+ return keypoints, scores
+
+
def ntu_pose_extraction(vid, skip_postproc=False):
tmp_dir = TemporaryDirectory()
frame_paths, _ = frame_extract(vid, out_dir=tmp_dir.name)
@@ -242,19 +267,17 @@ def ntu_pose_extraction(vid, skip_postproc=False):
if not skip_postproc:
det_results = ntu_det_postproc(vid, det_results)
- pose_results, _ = pose_inference(args.pose_config, args.pose_checkpoint,
- frame_paths, det_results, args.device)
anno = dict()
- anno['keypoint'] = np.stack(
- [pose['keypoints'].astype(np.float32) for pose in pose_results],
- axis=1)
- anno['keypoint_score'] = np.stack(
- [pose['keypoint_scores'] for pose in pose_results], axis=1)
+
+ keypoints, scores = pose_inference_with_align(args, frame_paths,
+ det_results)
+ anno['keypoint'] = keypoints
+ anno['keypoint_score'] = scores
anno['frame_dir'] = osp.splitext(osp.basename(vid))[0]
anno['img_shape'] = (1080, 1920)
anno['original_shape'] = (1080, 1920)
- anno['total_frames'] = len(pose_results)
+ anno['total_frames'] = keypoints.shape[1]
anno['label'] = int(osp.basename(vid).split('A')[1][:3]) - 1
tmp_dir.cleanup()
diff --git a/tools/data/sthv1/README.md b/tools/data/sthv1/README.md
index a441c8bd91..afe35eadc4 100644
--- a/tools/data/sthv1/README.md
+++ b/tools/data/sthv1/README.md
@@ -65,7 +65,7 @@ data = dict(
This part is **optional** if you only want to use RGB frames.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance.
@@ -141,4 +141,4 @@ mmaction2
```
-For training and evaluating on Something-Something V1, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on Something-Something V1, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/sthv1/README_zh-CN.md b/tools/data/sthv1/README_zh-CN.md
index dc10fa4cc9..9f8ab28a20 100644
--- a/tools/data/sthv1/README_zh-CN.md
+++ b/tools/data/sthv1/README_zh-CN.md
@@ -63,7 +63,7 @@ data = dict(
如果用户只想使用原 RGB 帧加载训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。
@@ -139,4 +139,4 @@ mmaction2
```
-关于对 Something-Something V1 进行训练和验证,可以参考 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 Something-Something V1 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。
diff --git a/tools/data/sthv2/README.md b/tools/data/sthv2/README.md
index c382ce7630..5e05e6ff90 100644
--- a/tools/data/sthv2/README.md
+++ b/tools/data/sthv2/README.md
@@ -16,12 +16,26 @@
```
For basic dataset information, you can refer to the dataset [website](https://developer.qualcomm.com/software/ai-datasets/something-something).
-Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/sthv2/`.
+
+`````{tabs}
+
+````{group-tab} Download by MIM
+MIM supports downloading from OpenDataLab and preprocessing Something-Something V2 dataset with one command line.
+```Bash
+# install OpenDataLab CLI tools
+pip install -U opendatalab
+# log in OpenDataLab
+odl login
+# download and preprocess by MIM
+mim download mmaction2 --dataset sthv2
+```
+
+````
## Step 1. Prepare Annotations
First of all, you have to sign in and download annotations to `$MMACTION2/data/sthv2/annotations` on the official [website](https://20bn.com/datasets/something-something/v2).
-
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/sthv2/`.
## Step 2. Prepare Videos
Then, you can download all data parts to `$MMACTION2/data/sthv2/` and use the following command to uncompress.
@@ -36,7 +50,7 @@ cd $MMACTION2/tools/data/sthv2/
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance.
@@ -78,7 +92,10 @@ cd $MMACTION2/tools/data/sthv2/
bash generate_{rawframes, videos}_filelist.sh
```
-## Step 5. Check Directory Structure
+````
+`````
+
+### Check Directory Structure
After the whole data process for Something-Something V2 preparation,
you will get the rawframes (RGB + Flow), videos and annotation files for Something-Something V2.
@@ -92,14 +109,14 @@ mmaction2
├── configs
├── data
│ ├── sthv2
-│ │ ├── sthv2_{train,val}_list_rawframes.txt
+│ │ ├── sthv2_{train,val}_list_rawframes.txt(Optional)
│ │ ├── sthv2_{train,val}_list_videos.txt
-│ │ ├── annotations
+│ │ ├── annotations(Optional)
│ | ├── videos
│ | | ├── 1.mp4
│ | | ├── 2.mp4
│ | | ├──...
-│ | ├── rawframes
+│ | ├── rawframes(Optional)
│ | | ├── 1
│ | | | ├── img_00001.jpg
│ | | | ├── img_00002.jpg
@@ -115,5 +132,4 @@ mmaction2
```
-For training and evaluating on Something-Something V2, please refer to [getting_started.md](/docs/getting_started.md).
-s/getting_started.md).
+For training and evaluating on Something-Something V2, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/sthv2/README_zh-CN.md b/tools/data/sthv2/README_zh-CN.md
index bc48e10a11..0bc5baf3fc 100644
--- a/tools/data/sthv2/README_zh-CN.md
+++ b/tools/data/sthv2/README_zh-CN.md
@@ -16,11 +16,27 @@
```
用户可参考该数据集的 [官网](https://developer.qualcomm.com/software/ai-datasets/something-something),以获取数据集相关的基本信息。
-在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/sthv2/`。
+`````{tabs}
+
+````{group-tab} 使用 MIM 下载
+MIM 支持下载 Something-Something V2 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。
+```Bash
+# 安装 OpenDataLab CLI 工具
+pip install -U opendatalab
+# 登录 OpenDataLab
+odl login
+# 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间
+mim download mmaction2 --dataset sthv2
+```
+
+````
+
+````{group-tab} 从官方源下载
## 步骤 1. 下载标注文件
首先,用户需要在 [官网](https://developer.qualcomm.com/software/ai-datasets/something-something) 完成注册,才能下载标注文件。下载好的标注文件需要放在 `$MMACTION2/data/sthv2/annotations` 文件夹下。
+用户可以使用以下命令下载标注文件。在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/sthv2/`。
## 步骤 2. 准备视频
@@ -36,7 +52,7 @@ cd $MMACTION2/tools/data/sthv2/
如果用户只想使用视频加载训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。
@@ -78,7 +94,10 @@ cd $MMACTION2/tools/data/sthv2/
bash generate_{rawframes, videos}_filelist.sh
```
-## 步骤 5. 检查文件夹结构
+````
+`````
+
+### 检查文件夹结构
在完成所有 Something-Something V2 数据集准备流程后,
用户可以获得对应的 RGB + 光流文件,视频文件以及标注文件。
@@ -92,14 +111,14 @@ mmaction2
├── configs
├── data
│ ├── sthv2
-│ │ ├── sthv2_{train,val}_list_rawframes.txt
+│ │ ├── sthv2_{train,val}_list_rawframes.txt(可选)
│ │ ├── sthv2_{train,val}_list_videos.txt
-│ │ ├── annotations
+│ │ ├── annotations(可选)
│ | ├── videos
│ | | ├── 1.mp4
│ | | ├── 2.mp4
│ | | ├──...
-│ | ├── rawframes
+│ | ├── rawframes(可选)
│ | | ├── 1
│ | | | ├── img_00001.jpg
│ | | | ├── img_00002.jpg
@@ -115,4 +134,4 @@ mmaction2
```
-关于对 Something-Something V2 进行训练和验证,可以参考 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 Something-Something V2 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。
diff --git a/tools/data/sthv2/preprocss.sh b/tools/data/sthv2/preprocss.sh
new file mode 100644
index 0000000000..440a3d42ba
--- /dev/null
+++ b/tools/data/sthv2/preprocss.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+cat $DOWNLOAD_DIR/sthv2/raw/*.tar.gz | tar -xvz -C $(dirname $DATA_ROOT)
+tar -xvf $DATA_ROOT/sthv2.tar -C $(dirname $DATA_ROOT)
+rm $DATA_ROOT/sthv2.tar
diff --git a/tools/data/thumos14/README.md b/tools/data/thumos14/README.md
index eaddb60cbe..f91ee1551c 100644
--- a/tools/data/thumos14/README.md
+++ b/tools/data/thumos14/README.md
@@ -40,7 +40,7 @@ bash download_videos.sh
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance.
@@ -139,4 +139,4 @@ mmaction2
│ │ │ | ├── video_test_0000001
```
-For training and evaluating on THUMOS'14, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on THUMOS'14, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/thumos14/README_zh-CN.md b/tools/data/thumos14/README_zh-CN.md
index fb7140a24e..1589cac14b 100644
--- a/tools/data/thumos14/README_zh-CN.md
+++ b/tools/data/thumos14/README_zh-CN.md
@@ -40,7 +40,7 @@ bash download_videos.sh
如果用户只想使用视频加载训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果用户有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 上。
用户可使用以下命令为 SSD 建立软链接。
@@ -136,4 +136,4 @@ mmaction2
│ │ │ | ├── video_test_0000001
```
-关于对 THUMOS'14 进行训练和验证,可以参照 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 THUMOS'14 进行训练和验证,可以参照 [训练教程](/docs/zh_cn/user_guides/train_test.md)。
diff --git a/tools/data/ucf101/README.md b/tools/data/ucf101/README.md
index 9abaff1b90..065ec8513d 100644
--- a/tools/data/ucf101/README.md
+++ b/tools/data/ucf101/README.md
@@ -43,7 +43,7 @@ python ../resize_videos.py ../../../data/ucf101/videos/ ../../../data/ucf101/vid
This part is **optional** if you only want to use the video loader.
-Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
+Before extracting, please refer to [install.md](/docs/en/get_started/installation.md) for installing [denseflow](https://github.com/open-mmlab/denseflow).
If you have plenty of SSD space, then we recommend extracting frames there for better I/O performance. The extracted frames (RGB + Flow) will take up about 100GB.
@@ -124,4 +124,4 @@ mmaction2
```
-For training and evaluating on UCF-101, please refer to [getting_started.md](/docs/getting_started.md).
+For training and evaluating on UCF-101, please refer to [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
diff --git a/tools/data/ucf101/README_zh-CN.md b/tools/data/ucf101/README_zh-CN.md
index 96e9453ff4..af07fbf104 100644
--- a/tools/data/ucf101/README_zh-CN.md
+++ b/tools/data/ucf101/README_zh-CN.md
@@ -41,7 +41,7 @@ python ../resize_videos.py ../../../data/ucf101/videos/ ../../../data/ucf101/vid
如果用户只想使用视频加载训练,则该部分是 **可选项**。
-在抽取视频帧和光流之前,请参考 [安装指南](/docs_zh_CN/install.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
+在抽取视频帧和光流之前,请参考 [安装指南](/docs/zh_cn/get_started/installation.md) 安装 [denseflow](https://github.com/open-mmlab/denseflow)。
如果拥有大量的 SSD 存储空间,则推荐将抽取的帧存储至 I/O 性能更优秀的 SSD 中。所抽取的视频帧和光流约占据 100 GB 的存储空间。
@@ -122,4 +122,4 @@ mmaction2
```
-关于对 UCF-101 进行训练和验证,可以参考 [基础教程](/docs_zh_CN/getting_started.md)。
+关于对 UCF-101 进行训练和验证,请参考 [训练和测试教程](/docs/en/user_guides/train_test.md)。
diff --git a/tools/data/video_retrieval/README.md b/tools/data/video_retrieval/README.md
new file mode 100644
index 0000000000..77f05ddcf7
--- /dev/null
+++ b/tools/data/video_retrieval/README.md
@@ -0,0 +1,45 @@
+# Preparing Video Retrieval Datasets
+
+## Introduction
+
+
+
+```BibTeX
+@inproceedings{xu2016msr,
+ title={Msr-vtt: A large video description dataset for bridging video and language},
+ author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong},
+ booktitle={CVPR},
+ pages={5288--5296},
+ year={2016}
+}
+```
+
+Before we start, please make sure that the directory is located at `$MMACTION2/tools/data/video_retrieval/`.
+
+## Preparing MSRVTT dataset
+
+For basic dataset information, you can refer to the MSRVTT dataset [website](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/). Run the following command to prepare the MSRVTT dataset:
+
+```shell
+bash prepare_msrvtt.sh
+```
+
+After preparation, the folder structure will look like:
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── video_retrieval
+│ │ └── msrvtt
+│ │ ├── train_9k.json
+│ │ ├── train_7k.json
+│ │ ├── test_JSFUSION.json
+│ │ └─── videos
+│ │ ├── video0.mp4
+│ │ ├── video1.mp4
+│ │ ├── ...
+│ │ └── video9999.mp4
+```
diff --git a/tools/data/video_retrieval/README_zh-CN.md b/tools/data/video_retrieval/README_zh-CN.md
new file mode 100644
index 0000000000..a4cd194f58
--- /dev/null
+++ b/tools/data/video_retrieval/README_zh-CN.md
@@ -0,0 +1,45 @@
+# 准备视频检索数据集
+
+## 简介
+
+
+
+```BibTeX
+@inproceedings{xu2016msr,
+ title={Msr-vtt: A large video description dataset for bridging video and language},
+ author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong},
+ booktitle={CVPR},
+ pages={5288--5296},
+ year={2016}
+}
+```
+
+在数据集准备前,请确保命令行当前路径为 `$MMACTION2/tools/data/video_retrieval/`。
+
+## 准备 MSRVTT 数据集
+
+用户可参考该数据集的[官网](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/),以获取数据集相关的基本信息。运行下面的命令准备 MSRVTT 数据集:
+
+```shell
+bash prepare_msrvtt.sh
+```
+
+完成上述准备步骤后,文件目录如下:
+
+```
+mmaction2
+├── mmaction
+├── tools
+├── configs
+├── data
+│ ├── video_retrieval
+│ │ └── msrvtt
+│ │ ├── train_9k.json
+│ │ ├── train_7k.json
+│ │ ├── test_JSFUSION.json
+│ │ └─── videos
+│ │ ├── video0.mp4
+│ │ ├── video1.mp4
+│ │ ├── ...
+│ │ └── video9999.mp4
+```
diff --git a/tools/data/video_retrieval/prepare_msrvtt.py b/tools/data/video_retrieval/prepare_msrvtt.py
new file mode 100644
index 0000000000..bf7ca3c091
--- /dev/null
+++ b/tools/data/video_retrieval/prepare_msrvtt.py
@@ -0,0 +1,51 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+
+import pandas as pd
+
+DATA_DIR = '../../../data/video_retrieval/msrvtt'
+SUFFIX = '.mp4'
+
+raw_data_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_data.json')
+train_csv_path = [
+ osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.9k.csv'),
+ osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_train.7k.csv')
+]
+test_csv_path = osp.join(DATA_DIR, 'msrvtt_data/MSRVTT_JSFUSION_test.csv')
+train_json_path = [
+ osp.join(DATA_DIR, 'train_9k.json'),
+ osp.join(DATA_DIR, 'train_7k.json')
+]
+test_json_path = osp.join(DATA_DIR, 'test_JSFUSION.json')
+
+with open(raw_data_path, 'r') as f:
+ data = json.load(f)
+
+sentences = data['sentences']
+video_dict = {}
+for sentence in sentences:
+ caption = sentence['caption']
+ video_id = sentence['video_id']
+ if video_id not in video_dict:
+ video_dict[video_id] = []
+ video_dict[video_id].append(caption)
+
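+# Each output JSON file maps a video filename (e.g. 'video0.mp4') to the
+# list of captions describing that video.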
+for ip, op in zip(train_csv_path, train_json_path):
+ train_csv = pd.read_csv(ip)
+ train_video_ids = list(train_csv['video_id'].values)
+ train_video_dict = {}
+ for video_id in train_video_ids:
+ train_video_dict[video_id + SUFFIX] = video_dict[video_id]
+
+ with open(op, 'w') as f:
+ json.dump(train_video_dict, f)
+
+test_data = pd.read_csv(test_csv_path)
+
+test_video_dict = {}
+for video_id, sentence in zip(test_data['video_id'], test_data['sentence']):
+ test_video_dict[video_id + SUFFIX] = [sentence]
+
+with open(test_json_path, 'w') as f:
+ json.dump(test_video_dict, f)
diff --git a/tools/data/video_retrieval/prepare_msrvtt.sh b/tools/data/video_retrieval/prepare_msrvtt.sh
new file mode 100644
index 0000000000..13cba357fc
--- /dev/null
+++ b/tools/data/video_retrieval/prepare_msrvtt.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+DATA_DIR="../../../data/video_retrieval/msrvtt"
+mkdir -p ${DATA_DIR}
+
+
+if [ -f "msrvtt_data.zip" ]; then
+ echo "msrvtt_data.zip exists, skip downloading!"
+else
+ echo "Downloading msrvtt_data.zip."
+ wget https://github.com/ArrowLuo/CLIP4Clip/releases/download/v0.0/msrvtt_data.zip
+fi
+
+echo "Processing annotations started."
+unzip -q msrvtt_data.zip -d ${DATA_DIR}
+python prepare_msrvtt.py
+echo "Processing annotations completed."
+
+if [ -f "MSRVTT.zip" ]; then
+ echo "MSRVTT.zip exists, skip downloading!"
+else
+ echo "Downloading MSRVTT.zip."
+ wget https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip
+fi
+
+echo "Processing videos started."
+unzip -q MSRVTT.zip -d ${DATA_DIR}
+mkdir -p "${DATA_DIR}/videos/" && find "${DATA_DIR}/MSRVTT/videos/all" -name "video*.mp4" -exec mv {} "${DATA_DIR}/videos/" \;
+echo "Processing videos completed."
+
+rm -rf "${DATA_DIR}/MSRVTT"
+rm -rf "${DATA_DIR}/msrvtt_data"
+rm msrvtt_data.zip
+rm MSRVTT.zip
+echo "The preparation of the msrvtt dataset has been successfully completed."
diff --git a/tools/deployment/export_onnx_gcn.py b/tools/deployment/export_onnx_gcn.py
new file mode 100644
index 0000000000..a4fd237a59
--- /dev/null
+++ b/tools/deployment/export_onnx_gcn.py
@@ -0,0 +1,164 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# This script serves the sole purpose of converting skeleton-based graph
+# in MMAction2 to ONNX files. Please note that attempting to convert other
+# models using this script may not yield successful results.
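+# Example usage (the config/checkpoint paths are placeholders):
+#   python tools/deployment/export_onnx_gcn.py ${CONFIG} ${CHECKPOINT} \
+#       --output_file stgcn.onnx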
+import argparse
+
+import numpy as np
+import onnxruntime
+import torch
+import torch.nn as nn
+from mmengine import Config
+from mmengine.registry import init_default_scope
+from mmengine.runner import load_checkpoint
+from mmengine.structures import LabelData
+
+from mmaction.registry import MODELS
+from mmaction.structures import ActionDataSample
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Get model flops and params')
+ parser.add_argument('config', help='config file path')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument(
+ '--num_frames', type=int, default=150, help='number of input frames.')
+ parser.add_argument(
+ '--num_person', type=int, default=2, help='number of maximum person.')
+ parser.add_argument(
+ '--num_joints',
+ type=int,
+ default=0,
+ help='number of joints. If not given, will use default settings from'
+ 'the config file')
+ parser.add_argument(
+ '--device', type=str, default='cpu', help='CPU/CUDA device option')
+ parser.add_argument(
+ '--output_file',
+ type=str,
+ default='stgcn.onnx',
+ help='file name of the output onnx file')
+ args = parser.parse_args()
+ return args
+
+
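+# Adaptive pooling layers with output size 1 are replaced by the explicit
+# global pooling modules below; the two are equivalent for output size 1,
+# and the explicit version exports more reliably to ONNX with dynamic
+# input shapes.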
+class AvgPool2d(nn.Module):
+
+ def forward(self, x):
+ return x.mean(dim=(-1, -2), keepdims=True)
+
+
+class MaxPool2d(nn.Module):
+
+ def forward(self, x):
+ x = x.max(dim=-1, keepdim=True)[0]
+ x = x.max(dim=-2, keepdim=True)[0]
+ return x
+
+
+class GCNNet(nn.Module):
+
+ def __init__(self, base_model):
+ super(GCNNet, self).__init__()
+ self.backbone = base_model.backbone
+ self.head = base_model.cls_head
+
+ if hasattr(self.head, 'pool'):
+ pool = self.head.pool
+ if isinstance(pool, nn.AdaptiveAvgPool2d):
+ assert pool.output_size == 1
+ self.head.pool = AvgPool2d()
+ elif isinstance(pool, nn.AdaptiveMaxPool2d):
+ assert pool.output_size == 1
+ self.head.pool = MaxPool2d()
+
+ def forward(self, input_tensor):
+ feat = self.backbone(input_tensor)
+ cls_score = self.head(feat)
+ return cls_score
+
+
+def softmax(x):
+ x = np.exp(x - x.max())
+ return x / x.sum()
+
+
+def main():
+ args = parse_args()
+ config = Config.fromfile(args.config)
+ init_default_scope(config.get('default_scope', 'mmaction'))
+
+ if config.model.type != 'RecognizerGCN':
+ print(
+ 'This script serves the sole purpose of converting skeleton-based '
+ 'graph in MMAction2 to ONNX files. Please note that attempting to '
+ 'convert other models using this script may not yield successful '
+ 'results.\n\n')
+
+ base_model = MODELS.build(config.model)
+ load_checkpoint(base_model, args.checkpoint, map_location='cpu')
+ base_model.to(args.device)
+
+ lookup = {'openpose': 18, 'nturgb+d': 25, 'coco': 17}
+
+ num_joints = args.num_joints
+ num_person = args.num_person
+ num_frames = args.num_frames
+ if num_joints == 0:
+ layout = config.model.backbone.graph_cfg.layout
+ if layout not in lookup:
+ raise KeyError(
+ '`layout` not supported, please specify `num_joints`')
+ num_joints = lookup[layout]
+
+ input_tensor = torch.randn(1, num_person, num_frames, num_joints, 3)
+ input_tensor = input_tensor.clamp(-3, 3).to(args.device)
+
+ base_model.eval()
+
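+    # Run the original PyTorch model once in 'predict' mode to obtain
+    # reference scores, which are compared with the ONNX Runtime output
+    # at the end of this script.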
+ data_sample = ActionDataSample()
+ data_sample.pred_scores = LabelData()
+ data_sample.pred_labels = LabelData()
+ base_output = base_model(
+ input_tensor.unsqueeze(0), data_samples=[data_sample],
+ mode='predict')[0]
+ base_output = base_output.pred_scores.item.detach().cpu().numpy()
+
+ model = GCNNet(base_model).to(args.device)
+ model.eval()
+
+ torch.onnx.export(
+ model, (input_tensor),
+ args.output_file,
+ input_names=['input_tensor'],
+ output_names=['cls_score'],
+ export_params=True,
+ do_constant_folding=True,
+ verbose=False,
+ opset_version=12,
+ dynamic_axes={
+ 'input_tensor': {
+ 0: 'batch_size',
+ 1: 'num_person',
+ 2: 'num_frames'
+ },
+ 'cls_score': {
+ 0: 'batch_size'
+ }
+ })
+
+    print(f'Successfully exported the onnx file to {args.output_file}')
+
+ # Test exported file
+ session = onnxruntime.InferenceSession(args.output_file)
+ input_feed = {'input_tensor': input_tensor.cpu().data.numpy()}
+ outputs = session.run(['cls_score'], input_feed=input_feed)
+ output = softmax(outputs[0][0])
+
+ diff = abs(base_output - output).max()
+ if diff < 1e-5:
+ print('The output difference is smaller than 1e-5.')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/deployment/export_onnx_posec3d.py b/tools/deployment/export_onnx_posec3d.py
new file mode 100644
index 0000000000..014096b48e
--- /dev/null
+++ b/tools/deployment/export_onnx_posec3d.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# This script serves the sole purpose of converting PoseC3D skeleton models
+# in MMAction2 to ONNX files. Please note that attempting to convert other
+# models using this script may not yield successful results.
+import argparse
+
+import numpy as np
+import onnxruntime
+import torch
+import torch.nn as nn
+from mmengine import Config
+from mmengine.registry import init_default_scope
+from mmengine.runner import load_checkpoint
+from mmengine.structures import LabelData
+
+from mmaction.registry import MODELS
+from mmaction.structures import ActionDataSample
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Get model flops and params')
+ parser.add_argument('config', help='config file path')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument(
+ '--num_frames', type=int, default=48, help='number of input frames.')
+ parser.add_argument(
+ '--image_size', type=int, default=64, help='size of the frame')
+ parser.add_argument(
+ '--num_joints',
+ type=int,
+ default=0,
+ help='number of joints. If not given, will use default settings from'
+ 'the config file')
+ parser.add_argument(
+ '--device', type=str, default='cpu', help='CPU/CUDA device option')
+ parser.add_argument(
+ '--output_file',
+ type=str,
+ default='posec3d.onnx',
+ help='file name of the output onnx file')
+ args = parser.parse_args()
+ return args
+
+
+class AvgPool3d(nn.Module):
+
+ def forward(self, x):
+ return x.mean(dim=(-1, -2, -3), keepdims=True)
+
+
+class MaxPool3d(nn.Module):
+
+ def forward(self, x):
+ x = x.max(dim=-1, keepdim=True)[0]
+ x = x.max(dim=-2, keepdim=True)[0]
+ x = x.max(dim=-3, keepdim=True)[0]
+ return x
+
+
+class GCNNet(nn.Module):
+
+ def __init__(self, base_model):
+ super(GCNNet, self).__init__()
+ self.backbone = base_model.backbone
+ self.head = base_model.cls_head
+
+ if hasattr(self.head, 'pool'):
+ pool = self.head.pool
+ if isinstance(pool, nn.AdaptiveAvgPool3d):
+ assert pool.output_size == 1
+ self.head.pool = AvgPool3d()
+ elif isinstance(pool, nn.AdaptiveMaxPool3d):
+ assert pool.output_size == 1
+ self.head.pool = MaxPool3d()
+
+ def forward(self, input_tensor):
+ feat = self.backbone(input_tensor)
+ cls_score = self.head(feat)
+ return cls_score
+
+
+def softmax(x):
+ x = np.exp(x - x.max())
+ return x / x.sum()
+
+
+def main():
+ args = parse_args()
+ config = Config.fromfile(args.config)
+
+    if config.model.type != 'Recognizer3D':
+ print('This script serves the sole purpose of converting PoseC3D '
+ 'skeleton models in MMAction2 to ONNX files. Please note that '
+ 'attempting to convert other models using this script may not '
+ 'yield successful results.\n\n')
+
+ init_default_scope(config.get('default_scope', 'mmaction'))
+
+ base_model = MODELS.build(config.model)
+ load_checkpoint(base_model, args.checkpoint, map_location='cpu')
+ base_model.to(args.device)
+
+ num_joints = args.num_joints
+ image_size = args.image_size
+ num_frames = args.num_frames
+ if num_joints == 0:
+ num_joints = config.model.backbone.in_channels
+
+ input_tensor = torch.randn(1, num_joints, num_frames, image_size,
+ image_size)
+ input_tensor = input_tensor.clamp(-3, 3).to(args.device)
+
+ base_model.eval()
+
+ data_sample = ActionDataSample()
+ data_sample.pred_scores = LabelData()
+ data_sample.pred_labels = LabelData()
+ base_output = base_model(
+ input_tensor.unsqueeze(0), data_samples=[data_sample],
+ mode='predict')[0]
+ base_output = base_output.pred_scores.item.detach().cpu().numpy()
+
+ model = GCNNet(base_model).to(args.device)
+ model.eval()
+
+ torch.onnx.export(
+ model, (input_tensor),
+ args.output_file,
+ input_names=['input_tensor'],
+ output_names=['cls_score'],
+ export_params=True,
+ do_constant_folding=True,
+ verbose=False,
+ opset_version=11,
+ dynamic_axes={
+ 'input_tensor': {
+ 0: 'batch_size',
+ 2: 'num_frames'
+ },
+ 'cls_score': {
+ 0: 'batch_size'
+ }
+ })
+
+    print(f'Successfully exported the onnx file to {args.output_file}')
+
+ # Test exported file
+ session = onnxruntime.InferenceSession(args.output_file)
+ input_feed = {'input_tensor': input_tensor.cpu().data.numpy()}
+ outputs = session.run(['cls_score'], input_feed=input_feed)
+ output = softmax(outputs[0][0])
+
+ diff = abs(base_output - output).max()
+ if diff < 1e-5:
+ print('The output difference is smaller than 1e-5.')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/deployment/export_onnx_stdet.py b/tools/deployment/export_onnx_stdet.py
index ba0cd2e388..31dd4473b8 100644
--- a/tools/deployment/export_onnx_stdet.py
+++ b/tools/deployment/export_onnx_stdet.py
@@ -1,4 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
+# This script serves the sole purpose of converting spatial-temporal detection
+# models supported in MMAction2 to ONNX files. Please note that attempting to
+# convert other models using this script may not yield successful results.
import argparse
import onnxruntime
@@ -125,6 +128,13 @@ def forward(self, input_tensor, rois):
def main():
args = parse_args()
config = Config.fromfile(args.config)
+
+ if config.model.type != 'FastRCNN':
+ print('This script serves the sole purpose of converting spatial '
+ 'temporal detection models in MMAction2 to ONNX files. Please '
+ 'note that attempting to convert other models using this script '
+ 'may not yield successful results.\n\n')
+
init_default_scope(config.get('default_scope', 'mmaction'))
base_model = MODELS.build(config.model)
diff --git a/tools/misc/clip_feature_extraction.py b/tools/misc/clip_feature_extraction.py
index a7a3e67635..332f60a961 100644
--- a/tools/misc/clip_feature_extraction.py
+++ b/tools/misc/clip_feature_extraction.py
@@ -2,230 +2,264 @@
import argparse
import os
import os.path as osp
-import warnings
-from datetime import datetime
-
-import mmcv
-import numpy as np
-import torch
-import torch.distributed as dist
-from mmcv import Config, DictAction
-from mmcv.cnn import fuse_conv_bn
-from mmcv.fileio.io import file_handlers
-from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
-from mmcv.runner import get_dist_info, init_dist, load_checkpoint
-from mmcv.runner.fp16_utils import wrap_fp16_model
-
-from mmaction.apis import multi_gpu_test, single_gpu_test
-from mmaction.datasets import build_dataloader, build_dataset
-from mmaction.models import build_model
-from mmaction.utils import register_module_hooks
+
+from mmengine import dump, list_from_file, load
+from mmengine.config import Config, DictAction
+from mmengine.runner import Runner
def parse_args():
parser = argparse.ArgumentParser(
- description='MMAction2 clip-level feature extraction')
+ description='MMAction2 feature extraction')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
- parser.add_argument('--video-list', help='video file list')
- parser.add_argument('--video-root', help='video root directory')
+ parser.add_argument('output_prefix', type=str, help='output prefix')
parser.add_argument(
- '--out',
- default=None,
- help='output result file in pkl/yaml/json format')
+ '--video-list', type=str, default=None, help='video file list')
+ parser.add_argument(
+ '--video-root', type=str, default=None, help='video root directory')
+ parser.add_argument(
+ '--spatial-type',
+ type=str,
+ default='avg',
+ choices=['avg', 'max', 'keep'],
+ help='Pooling type in spatial dimension')
parser.add_argument(
- '--fuse-conv-bn',
+ '--temporal-type',
+ type=str,
+ default='avg',
+ choices=['avg', 'max', 'keep'],
+ help='Pooling type in temporal dimension')
+ parser.add_argument(
+ '--long-video-mode',
action='store_true',
- help='Whether to fuse conv and bn, this will slightly increase'
- 'the inference speed')
+ help='Perform long video inference to get a feature list from a video')
+ parser.add_argument(
+ '--clip-interval',
+ type=int,
+ default=None,
+        help='Clip interval between the centers of adjacent sampled clips, '
+        'used for long video inference')
parser.add_argument(
- '--gpu-collect',
+ '--frame-interval',
+ type=int,
+ default=None,
+ help='Temporal interval of adjacent sampled frames, used for long '
+        'video inference')
+ parser.add_argument(
+ '--multi-view',
action='store_true',
- help='whether to use gpu to collect results')
+ help='Perform multi view inference')
parser.add_argument(
- '--tmpdir',
- help='tmp directory used for collecting results from multiple '
- 'workers, available when gpu-collect is not specified')
+ '--dump-score',
+ action='store_true',
+        help='Dump predicted scores rather than features')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
- default={},
help='override some settings in the used config, the key-value pair '
- 'in xxx=yyy format will be merged into config file. For example, '
- "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
- # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
- # will pass the `--local-rank` parameter to `tools/train.py` instead
- # of `--local_rank`.
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
-
return args
-def turn_off_pretrained(cfg):
- # recursively find all pretrained in the model config,
- # and set them None to avoid redundant pretrain steps for testing
- if 'pretrained' in cfg:
- cfg.pretrained = None
-
- # recursively turn off pretrained value
- for sub_cfg in cfg.values():
- if isinstance(sub_cfg, dict):
- turn_off_pretrained(sub_cfg)
-
+def merge_args(cfg, args):
+ """Merge CLI arguments to config."""
+ test_pipeline = cfg.test_dataloader.dataset.pipeline
+ # -------------------- Feature Head --------------------
+ if not args.dump_score:
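+ # map backbone types to the backbone_name argument expected by FeatureHead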
+ backbone_type2name = dict(
+ ResNet3dSlowFast='slowfast',
+ MobileNetV2TSM='tsm',
+ ResNetTSM='tsm',
+ )
-def text2tensor(text, size=256):
- nums = [ord(x) for x in text]
- assert len(nums) < size
- nums.extend([0] * (size - len(nums)))
- nums = np.array(nums, dtype=np.uint8)
- return torch.from_numpy(nums)
-
-
-def tensor2text(tensor):
- # 0 may not occur in a string
- chars = [chr(x) for x in tensor if x != 0]
- return ''.join(chars)
-
-
-def inference_pytorch(args, cfg, distributed, data_loader):
- """Get predictions by pytorch models."""
- # remove redundant pretrain steps for testing
- turn_off_pretrained(cfg.model)
-
- # build the model and load checkpoint
- model = build_model(
- cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg'))
-
- if len(cfg.module_hooks) > 0:
- register_module_hooks(model, cfg.module_hooks)
-
- fp16_cfg = cfg.get('fp16', None)
- if fp16_cfg is not None:
- wrap_fp16_model(model)
- load_checkpoint(model, args.checkpoint, map_location='cpu')
-
- if args.fuse_conv_bn:
- model = fuse_conv_bn(model)
-
- if not distributed:
- model = MMDataParallel(model, device_ids=[0])
- outputs = single_gpu_test(model, data_loader)
- else:
- model = MMDistributedDataParallel(
- model.cuda(),
- device_ids=[torch.cuda.current_device()],
- broadcast_buffers=False)
- outputs = multi_gpu_test(model, data_loader, args.tmpdir,
- args.gpu_collect)
-
- return outputs
+ if cfg.model.type == 'RecognizerGCN':
+ backbone_name = 'gcn'
+ else:
+ backbone_name = backbone_type2name.get(cfg.model.backbone.type)
+ num_segments = None
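+ # TSM shifts features across temporal segments, so num_segments
+ # must match the sampled clip length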
+ if backbone_name == 'tsm':
+ for idx, transform in enumerate(test_pipeline):
+ if transform.type == 'UntrimmedSampleFrames':
+ clip_len = transform['clip_len']
+ continue
+ elif transform.type == 'SampleFrames':
+ clip_len = transform['num_clips']
+ num_segments = cfg.model.backbone.get('num_segments', 8)
+ assert num_segments == clip_len, \
+ f'num_segments and clip length must be the same for TSM, but got ' \
+ f'num_segments {num_segments} clip_len {clip_len}'
+ if cfg.model.test_cfg is not None:
+ max_testing_views = cfg.model.test_cfg.get(
+ 'max_testing_views', num_segments)
+ assert max_testing_views % num_segments == 0, \
+ 'TSM needs to infer with a batch size that is a multiple ' \
+ 'of num_segments.'
+
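+ # 'keep' means no pooling along that dimension; pass None to FeatureHead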
+ spatial_type = None if args.spatial_type == 'keep' else \
+ args.spatial_type
+ temporal_type = None if args.temporal_type == 'keep' else \
+ args.temporal_type
+ feature_head = dict(
+ type='FeatureHead',
+ spatial_type=spatial_type,
+ temporal_type=temporal_type,
+ backbone_name=backbone_name,
+ num_segments=num_segments)
+ cfg.model.cls_head = feature_head
+
+ # ---------------------- multiple view ----------------------
+ if not args.multi_view:
+ # average features among multiple views
+ cfg.model.cls_head['average_clips'] = 'score'
+ if cfg.model.type == 'Recognizer3D':
+ for idx, transform in enumerate(test_pipeline):
+ if transform.type == 'SampleFrames':
+ test_pipeline[idx]['num_clips'] = 1
+ for idx, transform in enumerate(test_pipeline):
+ if transform.type == 'SampleFrames':
+ test_pipeline[idx]['twice_sample'] = False
+ # if transform.type in ['ThreeCrop', 'TenCrop']:
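+ # fall back to a single center crop instead of ten-crop testing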
+ if transform.type == 'TenCrop':
+ test_pipeline[idx].type = 'CenterCrop'
+
+ # -------------------- pipeline settings --------------------
+ # assign video list and video root
+ if args.video_list is not None:
+ cfg.test_dataloader.dataset.ann_file = args.video_list
+ if args.video_root is not None:
+ if cfg.test_dataloader.dataset.type == 'VideoDataset':
+ cfg.test_dataloader.dataset.data_prefix = dict(
+ video=args.video_root)
+ elif cfg.test_dataloader.dataset.type == 'RawframeDataset':
+ cfg.test_dataloader.dataset.data_prefix = dict(img=args.video_root)
+ args.video_list = cfg.test_dataloader.dataset.ann_file
+ args.video_root = cfg.test_dataloader.dataset.data_prefix
+ # use UntrimmedSampleFrames for long video inference
+ if args.long_video_mode:
+ # preserve features of multiple clips
+ cfg.model.cls_head['average_clips'] = None
+ cfg.test_dataloader.batch_size = 1
+ is_recognizer2d = (cfg.model.type == 'Recognizer2D')
+
+ frame_interval = args.frame_interval
+ for idx, transform in enumerate(test_pipeline):
+ if transform.type == 'UntrimmedSampleFrames':
+ clip_len = transform['clip_len']
+ continue
+ # replace SampleFrame by UntrimmedSampleFrames
+ elif transform.type in ['SampleFrames', 'UniformSample']:
+ assert args.clip_interval is not None, \
+ 'please specify clip interval for long video inference'
+ if is_recognizer2d:
+ # clip_len of UntrimmedSampleFrames is the same as
+ # num_clips for a 2D Recognizer.
+ clip_len = transform['num_clips']
+ else:
+ clip_len = transform['clip_len']
+ if frame_interval is None:
+ # take frame_interval of SampleFrames as default
+ frame_interval = transform.get('frame_interval')
+ assert frame_interval is not None, \
+ 'please specify frame interval for long video ' \
+ 'inference when use UniformSample or 2D Recognizer'
+
+ sample_cfgs = dict(
+ type='UntrimmedSampleFrames',
+ clip_len=clip_len,
+ clip_interval=args.clip_interval,
+ frame_interval=frame_interval)
+ test_pipeline[idx] = sample_cfgs
+ continue
+ # flow input will stack all frames
+ if cfg.test_dataloader.dataset.get('modality') == 'Flow':
+ clip_len = 1
+
+ if is_recognizer2d:
+ from mmaction.models import ActionDataPreprocessor
+ from mmaction.registry import MODELS
+
+ @MODELS.register_module()
+ class LongVideoDataPreprocessor(ActionDataPreprocessor):
+ """DataPreprocessor for 2D recognizer to infer on long video.
+
+ It stacks num_clips into the batch dimension to preserve the
+ feature of each clip (no averaging among clips).
+ """
+
+ def __init__(self, num_frames=8, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.num_frames = num_frames
+
+ def preprocess(self, inputs, data_samples, training=False):
+ batch_inputs, data_samples = super().preprocess(
+ inputs, data_samples, training)
+ # [N*M, T, C, H, W]
+ nclip_batch_inputs = batch_inputs.view(
+ (-1, self.num_frames) + batch_inputs.shape[2:])
+ # data_samples = data_samples * \
+ # nclip_batch_inputs.shape[0]
+ return nclip_batch_inputs, data_samples
+
+ preprocessor_cfg = cfg.model.data_preprocessor
+ preprocessor_cfg.type = 'LongVideoDataPreprocessor'
+ preprocessor_cfg['num_frames'] = clip_len
+
+ # -------------------- Dump predictions --------------------
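+ # all predictions are first dumped to total_feats.pkl,
+ # then split into per-video files by split_feats()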
+ args.dump = osp.join(args.output_prefix, 'total_feats.pkl')
+ dump_metric = dict(type='DumpResults', out_file_path=args.dump)
+ cfg.test_evaluator = [dump_metric]
+ cfg.work_dir = osp.join(args.output_prefix, 'work_dir')
+
+ return cfg
+
+
+def split_feats(args):
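+ """Split the dumped total_feats.pkl into one feature file per video."""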
+ total_feats = load(args.dump)
+ if args.dump_score:
+ total_feats = [sample['pred_scores']['item'] for sample in total_feats]
+
+ video_list = list_from_file(args.video_list)
+ video_list = [line.split(' ')[0] for line in video_list]
+
+ for video_name, feature in zip(video_list, total_feats):
+ dump(feature, osp.join(args.output_prefix, video_name + '.pkl'))
+ os.remove(args.dump)
def main():
args = parse_args()
+ # load config
cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+ cfg = merge_args(cfg, args)
+ cfg.launcher = args.launcher
- cfg.merge_from_dict(args.cfg_options)
-
- if cfg.model['test_cfg'] is None:
- cfg.model['test_cfg'] = dict(feature_extraction=True)
- else:
- cfg.model['test_cfg']['feature_extraction'] = True
+ cfg.load_from = args.checkpoint
- # Load output_config from cfg
- output_config = cfg.get('output_config', {})
- if args.out:
- # Overwrite output_config from args.out
- output_config = Config._merge_a_into_b(
- dict(out=args.out), output_config)
+ # build the runner from config
+ runner = Runner.from_cfg(cfg)
- assert output_config, 'Please specify output filename with --out.'
+ # start testing
+ runner.test()
- dataset_type = cfg.data.test.type
- if output_config.get('out', None):
- if 'output_format' in output_config:
- # ugly workround to make recognition and localization the same
- warnings.warn(
- 'Skip checking `output_format` in localization task.')
- else:
- out = output_config['out']
- # make sure the dirname of the output path exists
- mmcv.mkdir_or_exist(osp.dirname(out))
- _, suffix = osp.splitext(out)
- assert dataset_type == 'VideoDataset'
-
- assert suffix[1:] in file_handlers, (
- 'The format of the output '
- 'file should be json, pickle or yaml')
-
- # set cudnn benchmark
- if cfg.get('cudnn_benchmark', False):
- torch.backends.cudnn.benchmark = True
- cfg.data.test.test_mode = True
- cfg.data.test.data_prefix = args.video_root
-
- # init distributed env first, since logger depends on the dist info.
- if args.launcher == 'none':
- distributed = False
- else:
- distributed = True
- init_dist(args.launcher, **cfg.dist_params)
-
- rank, _ = get_dist_info()
-
- size = 256
- fname_tensor = torch.zeros(size, dtype=torch.uint8).cuda()
- if rank == 0:
- videos = open(args.video_list).readlines()
- videos = [x.strip() for x in videos]
-
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
- fake_anno = f'fake_anno_{timestamp}.txt'
- with open(fake_anno, 'w') as fout:
- lines = [x + ' 0' for x in videos]
- fout.write('\n'.join(lines))
- fname_tensor = text2tensor(fake_anno, size).cuda()
-
- if distributed:
- dist.broadcast(fname_tensor.cuda(), src=0)
-
- fname = tensor2text(fname_tensor)
- cfg.data.test.ann_file = fname
-
- # The flag is used to register module's hooks
- cfg.setdefault('module_hooks', [])
-
- # build the dataloader
- dataset = build_dataset(cfg.data.test, dict(test_mode=True))
- dataloader_setting = dict(
- videos_per_gpu=cfg.data.get('videos_per_gpu', 1),
- workers_per_gpu=cfg.data.get('workers_per_gpu', 1),
- dist=distributed,
- shuffle=False)
-
- dataloader_setting = dict(dataloader_setting,
- **cfg.data.get('test_dataloader', {}))
- data_loader = build_dataloader(dataset, **dataloader_setting)
-
- outputs = inference_pytorch(args, cfg, distributed, data_loader)
-
- if rank == 0:
- if output_config.get('out', None):
- out = output_config['out']
- print(f'\nwriting results to {out}')
- dataset.dump_results(outputs, **output_config)
- # remove the temporary file
- os.remove(fake_anno)
+ split_feats(args)
if __name__ == '__main__':
diff --git a/tools/visualizations/vis_scheduler.py b/tools/visualizations/vis_scheduler.py
index 17daa34e6b..623d8597c1 100644
--- a/tools/visualizations/vis_scheduler.py
+++ b/tools/visualizations/vis_scheduler.py
@@ -18,6 +18,8 @@
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn
from torch.utils.data import DataLoader
+from mmaction.utils import get_str_type
+
def parse_args():
parser = argparse.ArgumentParser(
@@ -225,7 +227,8 @@ def main():
if 'by_epoch' in cfg.train_cfg:
by_epoch = cfg.train_cfg.get('by_epoch')
elif 'type' in cfg.train_cfg:
- by_epoch = cfg.train_cfg.get('type') == 'EpochBasedTrainLoop'
+ by_epoch = get_str_type(cfg.train_cfg.get('type')) \
+ == 'EpochBasedTrainLoop'
else:
raise ValueError('please set `train_cfg`.')