From 69b95feb35065a33b53b66bd3f3c23c3957305ae Mon Sep 17 00:00:00 2001
From: Mohammed Innat
Date: Tue, 26 Mar 2024 12:15:20 +0600
Subject: [PATCH] update content

---
 MODEL_ZOO.md | 12 ++++++------
 README.md    | 39 +++++++++++++++++++--------------------
 2 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md
index ccdaeb8..efea0a6 100644
--- a/MODEL_ZOO.md
+++ b/MODEL_ZOO.md
@@ -19,20 +19,20 @@ In the training phase, the video swin mdoels are initialized with the pretrained
 
 | Model | Pretrain | #Frame | Top-1 | Top-5 | Checkpoints | config |
 | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
-| Swin-T | IN-1K | 32x4x3 | 78.8 | 93.6 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_tiny_kinetics400.weights.h5) | [swin-t](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_tiny_patch244_window877_kinetics400_1k.py) |
-| Swin-S | IN-1K | 32x4x3 | 80.6 | 94.5 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_small_kinetics400.weights.h5) | [swin-s](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_small_patch244_window877_kinetics400_1k.py) |
-| Swin-B | IN-1K | 32x4x3 | 80.6 | 94.6 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_kinetics400.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window877_kinetics400_1k.py) |
-| Swin-B | IN-22K | 32x4x3 | 82.7 | 95.5 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_kinetics400_imagenet22k.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window877_kinetics400_22k.py) |
+| Swin-T | IN-1K | 32x4x3 | 78.8 | 93.6 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_tiny_kinetics400_classifier.weights.h5) | [swin-t](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_tiny_patch244_window877_kinetics400_1k.py) |
+| Swin-S | IN-1K | 32x4x3 | 80.6 | 94.5 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_small_kinetics400_classifier.weights.h5) | [swin-s](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_small_patch244_window877_kinetics400_1k.py) |
+| Swin-B | IN-1K | 32x4x3 | 80.6 | 94.6 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_kinetics400_classifier.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window877_kinetics400_1k.py) |
+| Swin-B | IN-22K | 32x4x3 | 82.7 | 95.5 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_kinetics400_imagenet22k_classifier.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window877_kinetics400_22k.py) |
 
 ### Kinetics 600
 
 | Model | Pretrain | #Frame | Top-1 | Top-5 | Checkpoints | config |
 | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
-| Swin-B | IN-22K | 32x4x3 | 84.0 | 96.5 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_kinetics600_imagenet22k.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window877_kinetics600_22k.py) |
+| Swin-B | IN-22K | 32x4x3 | 84.0 | 96.5 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_kinetics600_imagenet22k_classifier.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window877_kinetics600_22k.py) |
 
 ### Something-Something V2
 
 | Model | Pretrain | #Frame | Top-1 | Top-5 | Checkpoints | config |
 | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
-| Swin-B | Kinetics 400 | 32x1x3 | 69.6 | 92.7 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_something_something_v2.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window1677_sthv2.py) |
+| Swin-B | Kinetics 400 | 32x1x3 | 69.6 | 92.7 | [h5](https://github.com/innat/VideoSwin/releases/download/v2.0/videoswin_base_something_something_v2_classifier.weights.h5) | [swin-b](https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/swin/swin_base_patch244_window1677_sthv2.py) |
 
diff --git a/README.md b/README.md
index 14cb7c3..1074f58 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ This is a unofficial `Keras 3` implementation of [Video Swin transformers](https
 
 # Checkpoints
 
-The **VideoSwin** checkpoints are available in `.weights.h5` for Kinetrics 400/600 and Something Something V2 datasets. The variants of this models are `tiny`, `small`, and `base`. Check [model zoo](https://github.com/innat/VideoSwin/blob/main/MODEL_ZOO.md) page to know details of it.
+The **VideoSwin** checkpoints are available in `.weights.h5` format for Kinetics 400/600 and Something Something V2 datasets. The available variants of this model are `tiny`, `small`, and `base`. See the [model zoo](https://github.com/innat/VideoSwin/blob/main/MODEL_ZOO.md) page for details.
 
 # Inference
 
@@ -31,32 +31,31 @@ The **VideoSwin** checkpoints are available in `.weights.h5` for Kinetrics 400/6
 A sample usage is shown below. We can pick any backend, i.e. tensorflow, torch or jax.
 
 ```python
->>> import os
->>> import torch
->>> os.environ["KERAS_BACKEND"] = "torch"
->>> from videoswin import VideoSwinT
+import os
+os.environ["KERAS_BACKEND"] = "torch"
+import torch
+from videoswin import VideoSwinT
 
->>> model = VideoSwinT(
+model = VideoSwinT(
     num_classes=400,
     include_rescaling=False,
     activation=None
 )
->>> _ = model(torch.ones((1, 32, 224, 224, 3)))
->>> model.load_weights('model.weights.h5')
+model.load_weights('model.weights.h5')
 
->>> container = read_video('sample.mp4')
->>> frames = frame_sampling(container, num_frames=32)
->>> y_pred = model(frames)
->>> y_pred.shape
+container = read_video('sample.mp4')
+frames = frame_sampling(container, num_frames=32)
+y_pred = model(frames)
+y_pred.shape
 TensorShape([1, 400])
 
->>> probabilities = torch.nn.functional.softmax(y_pred).detach().numpy()
->>> probabilities = probabilities.squeeze(0)
->>> confidences = {
+probabilities = torch.nn.functional.softmax(y_pred).detach().numpy()
+probabilities = probabilities.squeeze(0)
+confidences = {
     label_map_inv[i]: float(probabilities[i]) \
     for i in np.argsort(probabilities)[::-1]
 }
->>> confidences
+confidences
 ```
 
 A classification results on a sample from [Kinetics-400](https://paperswithcode.com/dataset/kinetics-400-1).
@@ -89,10 +88,10 @@ model.load_weights('model.weights.h5', skip_mismatch=True)
 
 **Guides**
 - To ensure the keras reimplementation with official torch: [logit comparison](guides/video-swin-transformer-keras-and-torchvision.ipynb)
-- To train with tensorflow backend:
-- To trian with torch backend:
-- To train with jax backend:
-- To train with torch-lightening (torch backend):
+- To train with tensorflow backend: [code](guides/tf_videoswin_video_classification.ipynb)
+- To train with torch backend: [code](guides/torch_videoswin_video_classification.ipynb)
+- To train with jax backend: [code](guides/jax_videoswin_video_classification.ipynb)
+- To train with torch-lightning (torch backend): [code](guides/torch_lightning_videoswin_video_classification.ipynb)
 
 ## Citation
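The inference example in the README hunk above calls `read_video` and `frame_sampling`, which are not defined anywhere in this patch. Below is a minimal sketch of what such helpers could look like, assuming OpenCV-based decoding, uniform temporal sampling of 32 frames, and resizing to 224x224; the repository's own guide notebooks may implement the preprocessing differently.

```python
# Hypothetical helpers matching the names used in the README snippet; not part
# of this patch, and the repository's own utilities may differ.
import cv2
import numpy as np

def read_video(path):
    # Decode every frame of the clip into a (num_frames, H, W, 3) uint8 array in RGB order.
    cap = cv2.VideoCapture(path)
    frames = []
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return np.stack(frames)

def frame_sampling(container, num_frames=32, size=(224, 224)):
    # Uniformly sample `num_frames` frames, resize each to `size`, and add a batch
    # axis so the result matches the (1, 32, 224, 224, 3) input used above.
    idx = np.linspace(0, len(container) - 1, num_frames).astype(int)
    clip = np.stack([cv2.resize(container[i], size) for i in idx])
    return clip[None, ...].astype("float32")
```

Note that the example builds the model with `include_rescaling=False`, so any pixel scaling or normalization expected by the pretrained checkpoints is assumed to happen outside the model, for instance inside a helper like the `frame_sampling` sketch above.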