Replace max tensor with max cut (#1295)
Replace MaxTensor with MaxCut for activation mixed precision (Experimental).
elad-c authored Dec 25, 2024
1 parent 3c536f2 commit ceaf820
Showing 26 changed files with 330 additions and 107 deletions.
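As context for the diff below, the conceptual difference between the two estimates can be sketched as follows (illustrative code only; the names and data structures are hypothetical, not MCT's API):

```python
# Illustrative sketch only -- names and structures are hypothetical, not MCT's API.
from typing import Dict, List, Set

def max_tensor_estimate(tensor_sizes: Dict[str, int]) -> int:
    # Previous behaviour: activation memory is bounded by the single largest tensor.
    return max(tensor_sizes.values())

def max_cut_estimate(cuts: List[Set[str]], tensor_sizes: Dict[str, int]) -> int:
    # New behaviour: for each cut (set of tensors alive at the same time in the schedule),
    # sum the live tensor sizes; the activation memory is the largest cut.
    return max(sum(tensor_sizes[t] for t in cut) for cut in cuts)

sizes = {"a": 4, "b": 6, "c": 3}
print(max_tensor_estimate(sizes))                         # 6
print(max_cut_estimate([{"a", "b"}, {"b", "c"}], sizes))  # 10 -- a and b are alive together
```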
15 changes: 8 additions & 7 deletions model_compression_toolkit/core/common/fusion/graph_fuser.py
@@ -36,10 +36,10 @@ def create_fused_graph(self, graph: Graph) -> Dict[str, str]:
The fusion process involves:
1. Creating new fused nodes to represent these groups.
2. Updating the graph structure to replace the original nodes with fused nodes.
3. Maintaining mapping mapping of original node names to their fused node names.
3. Maintaining mapping of original node names to their fused node names.
Args:
graph: Graph to sue its nodes.
graph: Graph to fuse its nodes.
Returns:
Mapping of original node names to their fused node names
@@ -54,7 +54,8 @@ def create_fused_graph(self, graph: Graph) -> Dict[str, str]:
fused_nodes_mapping[node.name] = new_fused_node.name
return fused_nodes_mapping

def _create_fused_node(self, nodes: List[BaseNode]) -> BaseNode:
@staticmethod
def _create_fused_node(nodes: List[BaseNode]) -> BaseNode:
"""
Create a new node that represents the fusion of the given nodes.
@@ -79,10 +80,10 @@ def _create_fused_node(self, nodes: List[BaseNode]) -> BaseNode:

return fused_node

def _replace_nodes_with_fused_node(self,
graph: Graph,
nodes_to_fuse: List[BaseNode],
fused_node: BaseNode):
@staticmethod
def _replace_nodes_with_fused_node(graph: Graph,
nodes_to_fuse: List[BaseNode],
fused_node: BaseNode):
"""
Replace the specified nodes in the graph with a new fused node.
@@ -51,13 +51,13 @@ def compute_graph_max_cut(memory_graph: MemoryGraph,
estimate = (u_bound + l_bound) / 2
schedule, max_cut_size, cuts = max_cut_astar.solve(estimate_factor=estimate, iter_limit=astar_n_iter)
if schedule is None:
return last_result
l_bound = estimate
else:
u_bound = min(estimate, max_cut_size)
last_result = (schedule, max_cut_size, cuts)

next_u_bound = min(estimate, max_cut_size)
last_result = (schedule, max_cut_size, cuts)

if l_bound * (1 + eps) >= next_u_bound:
return last_result
if l_bound * (1 + eps) >= u_bound:
return last_result

it += 1
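The updated loop above follows a standard bisection pattern over the estimate factor. A reduced sketch of that pattern, with stand-in names rather than the actual MCT signatures:

```python
# Simplified sketch of the bisection pattern after this change (stand-in names, not MCT code).
from typing import Callable, Optional, Tuple

def bisect_max_cut(solve: Callable[[float], Optional[Tuple[list, float]]],
                   l_bound: float, u_bound: float, eps: float, max_iters: int):
    last_result = None
    for _ in range(max_iters):
        estimate = (u_bound + l_bound) / 2
        result = solve(estimate)              # None means no schedule fits this estimate
        if result is None:
            l_bound = estimate                # infeasible -> raise the lower bound
        else:
            _, max_cut_size = result
            u_bound = min(estimate, max_cut_size)  # feasible -> tighten the upper bound
            last_result = result
        if l_bound * (1 + eps) >= u_bound:
            break                             # bounds are within the eps tolerance
    return last_result
```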

@@ -154,6 +154,9 @@ def solve(self, estimate_factor: float, iter_limit: int = 500) -> Tuple[List[Bas
cut_route = routes[next_cut]

if next_cut == self.target_cut:
# TODO maxcut: Why do we filter the cuts (cut_route) but not the max cut size (cut_cost)?
# This is a mismatch between max_cut and max(cuts).
# Also, the unfiltered cut_route seems correct, as it includes the input and output tensor sizes of the current op.
return self._remove_dummys_from_path(cut_route[0].op_order), cut_cost,\
list(set([self._remove_dummys_from_cut(self.clean_memory_for_next_step(c)) for c in cut_route]))

@@ -178,7 +181,8 @@ def solve(self, estimate_factor: float, iter_limit: int = 500) -> Tuple[List[Bas
cost = self.accumulate(cut_cost, c.memory_size())
if c not in open_list:
self._update_expanded_node(c, cost, cut_route, open_list, costs, routes)
elif self.ordering(cost, costs[c]):
# TODO maxcut: this isn't covered in the coverage test. check if needed and remove no cover
elif self.ordering(cost, costs[c]): # pragma: no cover
# If we already saw this cut during the search with a larger cost, then we want to update the order
# of the schedule in the cut
# Remove call - removes the cut with the same memory elements but different ordering from open
@@ -187,7 +191,8 @@ def solve(self, estimate_factor: float, iter_limit: int = 500) -> Tuple[List[Bas
self._update_expanded_node(c, cost, cut_route, open_list, costs, routes)

# Halt or No Solution
return None, 0, None
# TODO maxcut: this isn't covered in the coverage test. check if needed and remove no cover
return None, 0, None # pragma: no cover

@staticmethod
def _update_expanded_node(cut: Cut, cost: float, route: List[Cut], open_list: List[Cut],
@@ -223,8 +228,7 @@ def _get_cut_to_expand(self, open_list: List[Cut], costs: Dict[Cut, float], rout
"""
ordered_cuts_list = sorted(open_list,
key=lambda c: (self.accumulate(costs[c], self.estimate(c, estimate_factor)), len(routes[c])),
reverse=False)
key=lambda c: (self.accumulate(costs[c], self.estimate(c, estimate_factor)), -len(routes[c])))

assert len(ordered_cuts_list) > 0
return ordered_cuts_list[0]
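The sort-key change above flips the tie-break: among cuts with equal estimated cost, the one with the longer route (more scheduling progress) is now expanded first. A tiny illustration with hypothetical values:

```python
# Tie-break illustration (hypothetical values, not MCT data).
candidates = [("cut_a", 10.0, 2), ("cut_b", 10.0, 5)]  # (name, estimated cost, route length)

old_order = sorted(candidates, key=lambda c: (c[1], c[2]))    # shorter route wins ties
new_order = sorted(candidates, key=lambda c: (c[1], -c[2]))   # longer route wins ties

print(old_order[0][0])  # cut_a
print(new_order[0][0])  # cut_b
```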
@@ -349,7 +353,8 @@ def ordering(cost_1, cost_2) -> bool:
Returns: True if the first cost is smaller than the second one, False otherwise.
"""
return cost_1 < cost_2
# TODO maxcut: this isn't covered in the coverage test. check if needed and remove no cover
return cost_1 < cost_2 # pragma: no cover

def estimate(self, cut: Cut, estimate_factor: float) -> float:
"""
@@ -377,9 +382,10 @@ def get_init_estimate_factor(memory_graph: MemoryGraph) -> float:
Returns: An initial estimate value.
"""
l_bound = memory_graph.memory_lbound_single_op
u_bound = 2 * sum([t.total_size for t in memory_graph.b_nodes]) - l_bound
return (u_bound + l_bound) / 2
# TODO maxcut: this isn't covered in the coverage test. check if needed and remove no cover
l_bound = memory_graph.memory_lbound_single_op # pragma: no cover
u_bound = 2 * sum([t.total_size for t in memory_graph.b_nodes]) - l_bound # pragma: no cover
return (u_bound + l_bound) / 2 # pragma: no cover

@staticmethod
def _remove_dummys_from_path(path: List[BaseNode]) -> List[BaseNode]:
@@ -30,7 +30,12 @@ def __init__(self, shape: Tuple[Any], node_name: str, node_output_index: int, in
init_size_to_zero: Whether to initialize the memory tensor size to 0 or not.
"""

self.shape = shape[1:] # remove batch size (first element) from output shape
# Remove the batch size (first element) from the output shape. If the shape is a list, drop the first
# axis. If the shape is a 1-D vector (e.g. the output of a size op), subtract 1 from its value to ignore the batch entry.
if len(shape) == 1:
self.shape = [] if shape[0] is None else [shape[0] - 1]
else:
self.shape = shape[1:]
# The total size of a tensor is considered to be the number of elements in the tensor
self.total_size = self._get_tensor_total_size() if not init_size_to_zero else 0
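The new shape handling above can be illustrated with a standalone sketch (not the MCT class itself):

```python
# Standalone sketch of the new batch-dimension handling (not the MCT class).
def strip_batch(shape):
    if len(shape) == 1:
        # 1-D shape (e.g. the output of a size-like op): drop the batch entry from the count.
        return [] if shape[0] is None else [shape[0] - 1]
    # Multi-dimensional shape: drop the leading batch axis.
    return list(shape[1:])

print(strip_batch((None, 8, 8, 3)))  # [8, 8, 3]
print(strip_batch((5,)))             # [4]
print(strip_batch((None,)))          # []
```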

@@ -13,6 +13,7 @@
# limitations under the License.
# ==============================================================================
from typing import List
from operator import getitem

from model_compression_toolkit.core.common import Graph, BaseNode
from model_compression_toolkit.core.common.graph.edge import EDGE_SOURCE_INDEX
@@ -45,7 +46,8 @@ def __init__(self, model_graph: Graph):
tensor_to_node = []

for n in nodes:
n_outputs = [n.output_shape] if isinstance(n.output_shape, tuple) else n.output_shape
n_outputs = n.output_shape if isinstance(n.output_shape[0], (tuple, list)) else [n.output_shape]

out_edges = model_graph.out_edges(n, sort_by_attr=EDGE_SOURCE_INDEX)

for i, ot in enumerate(n_outputs):
@@ -54,7 +56,16 @@ def __init__(self, model_graph: Graph):
# Add memory tensor as current node's output
node_to_tensor.append((n, memory_tensor))

ot_edges = [oe for oe in out_edges if oe.source_index == i]
# TODO maxcut: refactor this code. it handles split->getitem generated by fx.
ot_edges = []
for oe in out_edges:
if oe.sink_node.type is getitem and len(oe.sink_node.op_call_args) == 1 and isinstance(oe.sink_node.op_call_args[0], int):
source_index = oe.sink_node.op_call_args[0]
else:
source_index = oe.source_index
if source_index == i:
ot_edges.append(oe)

for oe in ot_edges:
# Add current memory tensor as input to current node's successors
tensor_to_node.append((memory_tensor, oe.sink_node))
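The edge-selection rule added above can be sketched in isolation: for a split -> getitem pair generated by torch.fx, the getitem argument, rather than the edge's source index, determines which output tensor the consumer reads (mock objects, illustrative only):

```python
# Mock illustration of the split -> getitem edge resolution (not MCT classes).
from operator import getitem
from types import SimpleNamespace

def resolve_source_index(edge) -> int:
    # When the consumer is a getitem node with a single int argument (as torch.fx
    # generates after a split), that argument tells which split output is consumed.
    sink = edge.sink_node
    if sink.type is getitem and len(sink.op_call_args) == 1 and isinstance(sink.op_call_args[0], int):
        return sink.op_call_args[0]
    return edge.source_index

getitem_node = SimpleNamespace(type=getitem, op_call_args=[1])
edge = SimpleNamespace(sink_node=getitem_node, source_index=0)
print(resolve_source_index(edge))  # 1 -- the getitem argument wins over the edge index
```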
@@ -71,6 +82,7 @@ def __init__(self, model_graph: Graph):
inputs_tensors_memory = [sum([t.total_size for t in self.operation_node_children(n)])
for n in nodes if n in model_graph.get_inputs()]

# TODO maxcut: why use both the inputs and outputs of each node, when the A* search solves for node outputs only?
nodes_total_memory = [sum([t.total_size for t in self.operation_node_children(n)] +
[t.total_size for t in self.operation_node_parents(n)])
for n in nodes if n not in model_graph.get_inputs()]
@@ -24,8 +24,10 @@
from model_compression_toolkit.core.common.graph.virtual_activation_weights_node import VirtualActivationWeightsNode, \
VirtualSplitWeightsNode, VirtualSplitActivationNode
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import RUTarget, ResourceUtilization
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_functions_mapping import RuFunctions
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_aggregation_methods import MpRuAggregation
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import MpRuMetric
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import MpRuMetric, calc_graph_cuts
from model_compression_toolkit.core.common.graph.memory_graph.compute_graph_max_cut import Cut
from model_compression_toolkit.core.common.framework_info import FrameworkInfo
from model_compression_toolkit.core.common.mixed_precision.sensitivity_evaluation import SensitivityEvaluation

@@ -40,7 +42,7 @@ def __init__(self,
fw_info: FrameworkInfo,
fw_impl: FrameworkImplementation,
sensitivity_evaluator: SensitivityEvaluation,
ru_functions: Dict[RUTarget, Tuple[MpRuMetric, MpRuAggregation]],
ru_functions: Dict[RUTarget, RuFunctions[MpRuMetric, MpRuAggregation]],
target_resource_utilization: ResourceUtilization,
original_graph: Graph = None):
"""
@@ -65,8 +67,11 @@ def __init__(self,
self.sensitivity_evaluator = sensitivity_evaluator
self.layer_to_bitwidth_mapping = self.get_search_space()
self.compute_metric_fn = self.get_sensitivity_metric()
self._cuts = None

self.compute_ru_functions = ru_functions
ru_types = [ru_target for ru_target, ru_value in
target_resource_utilization.get_resource_utilization_dict().items() if ru_value < np.inf]
self.compute_ru_functions = {ru_target: ru_fn for ru_target, ru_fn in ru_functions.items() if ru_target in ru_types}
self.target_resource_utilization = target_resource_utilization
self.min_ru_config = self.graph.get_min_candidates_config(fw_info)
self.max_ru_config = self.graph.get_max_candidates_config(fw_info)
@@ -76,6 +81,17 @@ def __init__(self,
self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.graph,
original_graph=self.original_graph)

@property
def cuts(self) -> List[Cut]:
"""
Calculates the graph cuts. Written as a property so the cuts are computed only once, and
only if they are needed.
"""
if self._cuts is None:
self._cuts = calc_graph_cuts(self.original_graph)
return self._cuts

def get_search_space(self) -> Dict[int, List[int]]:
"""
The search space is a mapping from a node's index to a list of integers (possible bitwidth candidate indices
Expand Down Expand Up @@ -106,6 +122,21 @@ def get_sensitivity_metric(self) -> Callable:

return self.sensitivity_evaluator.compute_metric

def _calc_ru_fn(self, ru_target, ru_fn, mp_cfg) -> np.ndarray:
"""
Computes the resource utilization for a given mixed precision configuration.
The method computes the resource utilization vector for a specific resource utilization target.
Returns: resource utilization value.
"""
# ru_fn is a pair of resource utilization computation method and
# resource utilization aggregation method (in this method we only need the first one)
if ru_target is RUTarget.ACTIVATION:
return ru_fn.metric_fn(mp_cfg, self.graph, self.fw_info, self.fw_impl, self.cuts)
else:
return ru_fn.metric_fn(mp_cfg, self.graph, self.fw_info, self.fw_impl)
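This dispatch is the core of the change on the mixed-precision side: only the activation target receives the precomputed graph cuts. A reduced sketch of the pattern, with stand-in types rather than MCT's signatures:

```python
# Reduced sketch of the cuts-aware dispatch (stand-in names, not MCT's signatures).
from enum import Enum, auto

class Target(Enum):
    WEIGHTS = auto()
    ACTIVATION = auto()

def calc_ru(target: Target, metric_fn, cfg, graph, cuts):
    # Only the activation metric consumes the graph cuts; other targets keep their signature.
    if target is Target.ACTIVATION:
        return metric_fn(cfg, graph, cuts)
    return metric_fn(cfg, graph)
```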

def compute_min_ru(self) -> Dict[RUTarget, np.ndarray]:
"""
Computes a resource utilization vector with the values matching to the minimal mp configuration
@@ -118,10 +149,10 @@ def compute_min_ru(self) -> Dict[RUTarget, np.ndarray]:
"""
min_ru = {}
for ru_target, ru_fns in self.compute_ru_functions.items():
# ru_fns is a pair of resource utilization computation method and
for ru_target, ru_fn in self.compute_ru_functions.items():
# ru_fns is a pair of resource utilization computation method and
# resource utilization aggregation method (in this method we only need the first one)
min_ru[ru_target] = ru_fns[0](self.min_ru_config, self.graph, self.fw_info, self.fw_impl)
min_ru[ru_target] = self._calc_ru_fn(ru_target, ru_fn, self.min_ru_config)

return min_ru

@@ -212,7 +243,7 @@ def compute_node_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int,
"""
cfg = self.replace_config_in_index(self.min_ru_config, conf_node_idx, candidate_idx)
return self.compute_ru_functions[target].metric_fn(cfg, self.graph, self.fw_info, self.fw_impl)
return self._calc_ru_fn(target, self.compute_ru_functions[target], cfg)

@staticmethod
def replace_config_in_index(mp_cfg: List[int], idx: int, value: int) -> List[int]:
@@ -241,13 +272,15 @@ def _non_configurable_nodes_ru(self) -> Dict[RUTarget, np.ndarray]:
"""

non_conf_ru_dict = {}
for target, ru_value in self.target_resource_utilization.get_resource_utilization_dict().items():
for target, ru_fns in self.compute_ru_functions.items():
# Call for the ru method of the given target - empty quantization configuration list is passed since we
# compute for non-configurable nodes
if target == RUTarget.BOPS:
ru_vector = None
elif target == RUTarget.ACTIVATION:
ru_vector = ru_fns.metric_fn([], self.graph, self.fw_info, self.fw_impl, self.cuts)
else:
ru_vector = self.compute_ru_functions[target].metric_fn([], self.graph, self.fw_info, self.fw_impl)
ru_vector = ru_fns.metric_fn([], self.graph, self.fw_info, self.fw_impl)

non_conf_ru_dict[target] = ru_vector

@@ -266,14 +299,15 @@ def compute_resource_utilization_for_config(self, config: List[int]) -> Resource
"""

ru_dict = {}

for ru_target, ru_fns in self.compute_ru_functions.items():
# Passing False to ru methods and aggregations to indicates that the computations
# are not for constraints setting
if ru_target == RUTarget.BOPS:
configurable_nodes_ru_vector = ru_fns[0](config, self.original_graph, self.fw_info, self.fw_impl, False)
configurable_nodes_ru_vector = ru_fns.metric_fn(config, self.original_graph, self.fw_info, self.fw_impl, False)
elif ru_target == RUTarget.ACTIVATION:
configurable_nodes_ru_vector = ru_fns.metric_fn(config, self.graph, self.fw_info, self.fw_impl, self.cuts)
else:
configurable_nodes_ru_vector = ru_fns[0](config, self.original_graph, self.fw_info, self.fw_impl)
configurable_nodes_ru_vector = ru_fns.metric_fn(config, self.original_graph, self.fw_info, self.fw_impl)
non_configurable_nodes_ru_vector = self.non_conf_ru_dict.get(ru_target)
if non_configurable_nodes_ru_vector is None or len(non_configurable_nodes_ru_vector) == 0:
ru_ru = self.compute_ru_functions[ru_target].aggregate_fn(configurable_nodes_ru_vector, False)
@@ -647,7 +681,7 @@ def get_weights_for_split_activation(self,
# It's ok, need to find the node's configuration
self.retrieve_weights_activation_config(activation_node, weights_node, virtual_node, virtual_cfg_idx, virtual_mp_cfg)
else:
Logger.critical(f"Virtual graph configuration error: Expected the predecessor of node '{n.name}' to have multiple outputs when not composed with an activation node.") # pragma: no cover
Logger.critical(f"Virtual graph configuration error: Expected the predecessor of node '{weights_node.name}' to have multiple outputs when not composed with an activation node.") # pragma: no cover

def update_config_at_original_idx(self, n: BaseNode, origin_cfg_idx: int):
"""