
Commit

weighting is already done when computing combine_tensor
lucidrains committed Aug 21, 2023
1 parent 0b6bf96 commit 4a073bc
Showing 2 changed files with 1 addition and 3 deletions.
2 changes: 0 additions & 2 deletions mixture_of_experts/mixture_of_experts.py
@@ -199,15 +199,13 @@ def forward(self, x, importance = None):
     # [batch, group]
     position_in_expert_1 = position_in_expert_1.sum(dim=-1)
     # Weight assigned to first expert. [batch, group]
-    gate_1 *= mask_1_flat

     position_in_expert_2 = cumsum_exclusive(mask_2, dim=-2) + mask_1_count
     position_in_expert_2 *= mask_2
     mask_2 *= (position_in_expert_2 < expert_capacity_f).float()
     mask_2_flat = mask_2.sum(dim=-1)

     position_in_expert_2 = position_in_expert_2.sum(dim=-1)
-    gate_2 *= mask_2_flat

     # [batch, group, experts, expert_capacity]
     combine_tensor = (
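For context, the two deleted lines are redundant because the combine weights already apply the capacity masks. A minimal sketch of tensor2tensor-style top-2 gating, showing only the first expert's contribution; names and shapes here are illustrative assumptions, not the file's literal code:

import torch
import torch.nn.functional as F

batch, group, num_experts, expert_capacity = 2, 4, 4, 2

# hypothetical per-token routing results for the first chosen expert
gate_1 = torch.rand(batch, group)                                          # gate weight, [batch, group]
mask_1_flat = torch.randint(0, 2, (batch, group)).float()                  # 1 if the token fit within capacity
index_1 = torch.randint(0, num_experts, (batch, group))                    # chosen expert per token
position_in_expert_1 = torch.randint(0, expert_capacity, (batch, group))   # slot within that expert

# The combine weights multiply gate_1 by mask_1_flat right here; since the mask
# is binary, multiplying gate_1 by mask_1_flat beforehand (the deleted line)
# changes nothing and can be dropped.
combine_1 = (
    gate_1[..., None, None]
    * mask_1_flat[..., None, None]
    * F.one_hot(index_1, num_experts)[..., None].float()
    * F.one_hot(position_in_expert_1, expert_capacity)[..., None, :].float()
)   # [batch, group, num_experts, expert_capacity]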
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'mixture-of-experts',
   packages = find_packages(),
-  version = '0.2.1',
+  version = '0.2.2',
   license='MIT',
   description = 'Sparsely-Gated Mixture of Experts for Pytorch',
   author = 'Phil Wang',
