From 44f304f731759fb37a6391c8aa448f1917eb49aa Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Thu, 29 Feb 2024 11:41:12 +0000
Subject: [PATCH 01/21] Implement outlet matching -Update and test KStest

---
 swmmanywhere/metric_utilities.py | 82 ++++++++++++++++++++++++++++++--
 tests/test_metric_utilities.py   | 77 ++++++++++++++++++++++++++----
 2 files changed, 148 insertions(+), 11 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index a01c4842..942d6ba0 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -96,12 +96,88 @@ class kstest_betweenness(BaseMetric):
     def __call__(self, 
                  synthetic_G: nx.Graph,
                  real_G: nx.Graph,
+                 real_subs: gpd.GeoDataFrame,
+                 real_results: pd.DataFrame,
                  *args,
                  **kwargs) -> float:
         """Run the evaluated metric."""
-        syn_betweenness = nx.betweenness_centrality(synthetic_G)
-        real_betweenness = nx.betweenness_centrality(real_G)
+        # Identify synthetic outlet and subgraph
+        sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
+        
+        # Identify real outlet and subgraph
+        sg_real, _ = dominant_outlet(real_G, real_results)
+
+        syn_betweenness = nx.betweenness_centrality(sg_syn)
+        real_betweenness = nx.betweenness_centrality(sg_real)
 
         #TODO does it make more sense to use statistic or pvalue?
         return stats.ks_2samp(list(syn_betweenness.values()),
-                              list(real_betweenness.values())).statistic
\ No newline at end of file
+                              list(real_betweenness.values())).statistic
+
+def best_outlet_match(synthetic_G: nx.Graph,
+                      real_subs: gpd.GeoDataFrame) -> tuple[nx.Graph,int]:
+    """Best outlet match.
+    
+    Identify the outlet with the most nodes within the real_subs and return the
+    subgraph of the synthetic graph of nodes that drain to that outlet.
+
+    Args:
+        synthetic_G (nx.Graph): The synthetic graph.
+        real_subs (gpd.GeoDataFrame): The real subcatchments.
+
+    Returns:
+        nx.Graph: The subgraph of the synthetic graph for the outlet with the 
+            most nodes within the real_subs.
+        int: The id of the outlet.
+    """
+    # Identify which nodes fall within real_subs
+    nodes_df = pd.DataFrame([d for x,d in synthetic_G.nodes(data=True)],
+                             index = synthetic_G.nodes)
+    nodes_gdf = gpd.GeoDataFrame(nodes_df, 
+                                 geometry=gpd.points_from_xy(nodes_df.x, 
+                                                             nodes_df.y),
+                                 crs = synthetic_G.graph['crs'])
+    nodes_joined = nodes_gdf.sjoin(real_subs, 
+                                   how="right", 
+                                   predicate="intersects")
+
+    # Select the most common outlet
+    outlet = nodes_joined.outlet.value_counts().idxmax()
+
+    # Subselect the matching graph
+    outlet_nodes = [n for n, d in synthetic_G.nodes(data=True) 
+                    if d['outlet'] == outlet]
+    return synthetic_G.subgraph(outlet_nodes), outlet
+
+def dominant_outlet(G: nx.Graph,
+                    results: pd.DataFrame) -> tuple[nx.Graph,int]:
+    """Dominant outlet.
+
+    Identify the outlet with highest flow along it and return the
+    subgraph of the graph of nodes that drain to that outlet.
+
+    Args:
+        G (nx.Graph): The graph.
+        results (pd.DataFrame): The results, which include a 'flow' and 'id' 
+            column.
+
+    Returns:
+        nx.Graph: The subgraph of nodes/arcs that the reach max flow outlet
+        int: The id of the outlet.
+    """
+    # Identify outlets of the graph
+    outlets = [n for n, outdegree in G.out_degree() 
+               if outdegree == 0]
+    outlet_arcs = [d['id'] for u,v,d in G.edges(data=True) 
+                   if v in outlets]
+    
+    # Identify the outlet with the highest flow
+    outlet_flows = results.loc[(results.variable == 'flow') &
+                               (results.object.isin(outlet_arcs))]
+    max_outlet_arc = outlet_flows.groupby('object').value.mean().idxmax()
+    max_outlet = [v for u,v,d in G.edges(data=True) 
+                  if d['id'] == max_outlet_arc][0]
+    
+    # Subselect the matching graph
+    sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet})
+    return sg, max_outlet
\ No newline at end of file
diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index 5cb599ab..b4368363 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -1,9 +1,12 @@
 from pathlib import Path
 
+import geopandas as gpd
+import networkx as nx
 import pandas as pd
+import shapely
 
+from swmmanywhere import metric_utilities as mu
 from swmmanywhere.graph_utilities import load_graph
-from swmmanywhere.metric_utilities import metrics as sm
 
 
 def test_bias_flood_depth():
@@ -31,19 +34,77 @@ def test_bias_flood_depth():
     })
 
     # Run the metric
-    val = sm.bias_flood_depth(synthetic_results = synthetic_results, 
+    val = mu.metrics.bias_flood_depth(synthetic_results = synthetic_results, 
                               real_results = real_results,
                               synthetic_subs = synthetic_subs,
                               real_subs = real_subs)
     assert val == -0.29523809523809524
 
-def test_kstest_betweenness():
-    """Test the kstest_betweenness metric."""
+def test_best_outlet_match():
+    """Test the best_outlet_match and ks_betweenness."""
     G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
-    val = sm.kstest_betweenness(synthetic_G = G, real_G = G)
+    subs = [shapely.Polygon([(700262, 5709928),
+                            (700262, 5709883),
+                            (700351, 5709883),
+                            (700351, 5709906),
+                            (700306, 5709906),
+                            (700306, 5709928),
+                            (700262, 5709928)]),
+            shapely.Polygon([(700306, 5709928),
+                            (700284, 5709928),
+                            (700284, 5709950),
+                            (700374, 5709950),
+                            (700374, 5709906),
+                            (700351, 5709906),
+                            (700306, 5709906),
+                            (700306, 5709928)]),
+            shapely.Polygon([(700351, 5709883),
+                            (700351, 5709906),
+                            (700374, 5709906),
+                            (700374, 5709883),
+                            (700396, 5709883),
+                            (700396, 5709816),
+                            (700329, 5709816),
+                            (700329, 5709838),
+                            (700329, 5709883),
+                            (700351, 5709883)])]
+
+    subs = gpd.GeoDataFrame(data = {'id' : [107733,
+                                            1696030874,
+                                            6277683849]
+                                            },
+                                    geometry = subs,
+                                    crs = G.graph['crs'])
+    
+    sg, outlet = mu.best_outlet_match(synthetic_G = G, 
+                                     real_subs = subs)
+    outlets = nx.get_node_attributes(sg, 'outlet')
+    assert len(set(outlets.values())) == 1
+    assert outlet == 12354833
+
+    results = pd.DataFrame([{'object' : 4253560,
+                             'variable' : 'flow',
+                             'value' : 10},
+                            {'object' : '',
+                             'variable' : 'flow',
+                             'value' : 5}])
+
+    val = mu.metrics.kstest_betweenness(synthetic_G = G, 
+                                 real_G = G, 
+                                 real_subs = subs,
+                                 real_results = results)
+    
     assert val == 0.0
 
+    # Move the outlet up one node
     G_ = G.copy()
-    G_.remove_node(list(G.nodes)[0])
-    val = sm.kstest_betweenness(synthetic_G = G_, real_G = G)
-    assert val == 0.286231884057971
\ No newline at end of file
+    nx.set_node_attributes(G_,
+                           list(G_.in_edges(outlet))[0][0],
+                           'outlet')
+    G_.remove_node(outlet)
+
+    val = mu.metrics.kstest_betweenness(synthetic_G = G_,
+                                real_G = G,
+                                real_subs = subs,
+                                real_results = results)
+    assert val == 0.4

From 1c829e9592d2165713720a14be009cf2f0e8ef74 Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Thu, 29 Feb 2024 16:21:33 +0000
Subject: [PATCH 02/21] Implement outlet matching

-Update and test KStest
-Add and test flow/flooding for a 'dominant' outlet
---
 swmmanywhere/metric_utilities.py | 105 ++++++++++++++
 tests/test_metric_utilities.py   | 235 ++++++++++++++++++++++++++-----
 2 files changed, 308 insertions(+), 32 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 942d6ba0..8c2da18d 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -113,6 +113,111 @@ def __call__(self,
         #TODO does it make more sense to use statistic or pvalue?
         return stats.ks_2samp(list(syn_betweenness.values()),
                               list(real_betweenness.values())).statistic
+    
+@register_metric
+class outlet_nse_flow(BaseMetric):
+    """Outlet NSE flow.
+
+    Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow
+    is measured as the total flow of all arcs that drain to the 'dominant'
+    outlet node. The dominant outlet node of the 'real' network is calculated by
+    dominant_outlet, while the dominant outlet node of the 'synthetic' network
+    is calculated by best_outlet_match.
+    """
+    def __call__(self,
+                  synthetic_G: nx.Graph,
+                  synthetic_results: pd.DataFrame,
+                  real_G: nx.Graph,
+                  real_results: pd.DataFrame,
+                  real_subs: gpd.GeoDataFrame,
+                  *args,
+                  **kwargs) -> float:
+        """Run the evaluated metric."""
+        # Identify synthetic and real arcs that flow into the best outlet node
+        _, syn_outlet = best_outlet_match(synthetic_G, real_subs)
+        syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True)
+                   if v == syn_outlet]
+        _, real_outlet = dominant_outlet(real_G, real_results)
+        real_arc = [d['id'] for u,v,d in real_G.edges(data=True)
+                    if v == real_outlet]
+        
+        # Extract flow data
+        syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') &
+                                        (synthetic_results.object.isin(syn_arc))]
+        syn_flow = syn_flow.groupby('date').value.sum().reset_index()
+
+        real_flow = real_results.loc[(real_results.variable == 'flow') &
+                                        (real_results.object.isin(real_arc))]
+        real_flow = real_flow.groupby('date').value.sum().reset_index()
+
+        # Align data
+        df = pd.merge(syn_flow, 
+                      real_flow, 
+                      on = 'date', 
+                      suffixes = ('_syn','_real'),
+                      how = 'outer')
+        df = df.sort_values(by='date')
+
+        # Interpolate to time in real data
+        df['value_syn'] = df.set_index('date').value_syn.interpolate().values
+        df = df.dropna(subset = 'value_real')
+
+        # Calculate NSE
+        return nse(df.value_real, df.value_syn)
+
+@register_metric
+class outlet_nse_flooding(BaseMetric):
+    """Outlet NSE flooding.
+    
+    Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where
+    flooding is the total volume of flooded water across all nodes that drain
+    to the 'dominant' outlet node. The dominant outlet node of the 'real' 
+    network is calculated by dominant_outlet, while the dominant outlet node of
+    the 'synthetic' network is calculated by best_outlet_match.
+    """
+    def __call__(self,
+                  synthetic_G: nx.Graph,
+                  synthetic_results: pd.DataFrame,
+                  real_G: nx.Graph,
+                  real_results: pd.DataFrame,
+                  real_subs: gpd.GeoDataFrame,
+                  *args,
+                  **kwargs) -> float:
+        """Run the evaluated metric."""
+        # Identify synthetic and real outlet arcs
+        sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
+        sg_real, _ = dominant_outlet(real_G, real_results)
+        
+        # Extract flooding data
+        syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') &
+                                        synthetic_results.object.isin(sg_syn.nodes)]
+        
+        real_flood = real_results.loc[(real_results.variable == 'flooding') &
+                                        real_results.object.isin(sg_real.nodes)]
+        
+        # Aggregate to total flooding over network
+        syn_flood = syn_flood.groupby('date').value.sum().reset_index()
+        real_flood = real_flood.groupby('date').value.sum().reset_index()
+
+        # Align data
+        df = pd.merge(syn_flood, 
+                      real_flood, 
+                      on = 'date', 
+                      suffixes = ('_syn','_real'),
+                      how = 'outer')
+        df = df.sort_values(by='date')
+
+        # Interpolate to time in real data
+        df['value_syn'] = df.set_index('date').value_syn.interpolate().values
+        df = df.dropna(subset='value_real')
+
+        # Calculate NSE
+        return nse(df.value_real, df.value_syn)
+
+def nse(y: np.ndarray,
+        yhat: np.ndarray) -> float:
+    """Calculate Nash-Sutcliffe efficiency (NSE)."""
+    return 1 - np.sum((y - yhat)**2) / np.sum((y - np.mean(y))**2)
 
 def best_outlet_match(synthetic_G: nx.Graph,
                       real_subs: gpd.GeoDataFrame) -> tuple[nx.Graph,int]:
diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index b4368363..4efd921e 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -2,6 +2,7 @@
 
 import geopandas as gpd
 import networkx as nx
+import numpy as np
 import pandas as pd
 import shapely
 
@@ -9,6 +10,42 @@
 from swmmanywhere.graph_utilities import load_graph
 
 
+def get_subs():
+    """Get a GeoDataFrame of subcatchments."""
+    subs = [shapely.Polygon([(700262, 5709928),
+                            (700262, 5709883),
+                            (700351, 5709883),
+                            (700351, 5709906),
+                            (700306, 5709906),
+                            (700306, 5709928),
+                            (700262, 5709928)]),
+            shapely.Polygon([(700306, 5709928),
+                            (700284, 5709928),
+                            (700284, 5709950),
+                            (700374, 5709950),
+                            (700374, 5709906),
+                            (700351, 5709906),
+                            (700306, 5709906),
+                            (700306, 5709928)]),
+            shapely.Polygon([(700351, 5709883),
+                            (700351, 5709906),
+                            (700374, 5709906),
+                            (700374, 5709883),
+                            (700396, 5709883),
+                            (700396, 5709816),
+                            (700329, 5709816),
+                            (700329, 5709838),
+                            (700329, 5709883),
+                            (700351, 5709883)])]
+
+    subs = gpd.GeoDataFrame(data = {'id' : [107733,
+                                            1696030874,
+                                            6277683849]
+                                            },
+                                    geometry = subs,
+                                    crs = 'EPSG:32630')
+    return subs
+
 def test_bias_flood_depth():
     """Test the bias_flood_depth metric."""
     # Create synthetic and real data
@@ -43,38 +80,7 @@ def test_bias_flood_depth():
 def test_best_outlet_match():
     """Test the best_outlet_match and ks_betweenness."""
     G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
-    subs = [shapely.Polygon([(700262, 5709928),
-                            (700262, 5709883),
-                            (700351, 5709883),
-                            (700351, 5709906),
-                            (700306, 5709906),
-                            (700306, 5709928),
-                            (700262, 5709928)]),
-            shapely.Polygon([(700306, 5709928),
-                            (700284, 5709928),
-                            (700284, 5709950),
-                            (700374, 5709950),
-                            (700374, 5709906),
-                            (700351, 5709906),
-                            (700306, 5709906),
-                            (700306, 5709928)]),
-            shapely.Polygon([(700351, 5709883),
-                            (700351, 5709906),
-                            (700374, 5709906),
-                            (700374, 5709883),
-                            (700396, 5709883),
-                            (700396, 5709816),
-                            (700329, 5709816),
-                            (700329, 5709838),
-                            (700329, 5709883),
-                            (700351, 5709883)])]
-
-    subs = gpd.GeoDataFrame(data = {'id' : [107733,
-                                            1696030874,
-                                            6277683849]
-                                            },
-                                    geometry = subs,
-                                    crs = G.graph['crs'])
+    subs = get_subs()
     
     sg, outlet = mu.best_outlet_match(synthetic_G = G, 
                                      real_subs = subs)
@@ -108,3 +114,168 @@ def test_best_outlet_match():
                                 real_subs = subs,
                                 real_results = results)
     assert val == 0.4
+
+def test_nse():
+    """Test the nse metric."""
+    val = mu.nse(y = np.array([1,2,3,4,5]),
+                         yhat = np.array([1,2,3,4,5]))
+    assert val == 1.0
+
+    val = mu.nse(y = np.array([1,2,3,4,5]),
+                         yhat = np.array([3,3,3,3,3]))
+    assert val == 0.0
+
+def test_outlet_nse_flow():
+    """Test the outlet_nse_flow metric."""
+    # Load data
+    G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
+    subs = get_subs()
+
+    # Mock results
+    results = pd.DataFrame([{'object' : 4253560,
+                             'variable' : 'flow',
+                             'value' : 10,
+                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                            {'object' : '',
+                             'variable' : 'flow',
+                             'value' : 5,
+                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                             {'object' : 4253560,
+                             'variable' : 'flow',
+                             'value' : 5,
+                             'date' : pd.to_datetime('2021-01-01 00:00:05')},
+                            {'object' : '',
+                             'variable' : 'flow',
+                             'value' : 2,
+                             'date' : pd.to_datetime('2021-01-01 00:00:05')}])
+    
+    # Calculate NSE (perfect results)
+    val = mu.metrics.outlet_nse_flow(synthetic_G = G,
+                                    synthetic_results = results,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == 1.0
+
+    # Calculate NSE (mean results)
+    results_ = results.copy()
+    results_.loc[[0,2],'value'] = 7.5
+    val = mu.metrics.outlet_nse_flow(synthetic_G = G,
+                                    synthetic_results = results_,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == 0.0
+
+    # Change the graph
+    G_ = G.copy()
+    new_outlet = list(G_.in_edges(12354833))[0][0]
+    nx.set_node_attributes(G_,
+                           new_outlet,
+                           'outlet')
+    G_.remove_node(12354833)
+    results_.loc[results_.object == 4253560, 'object'] = 725226531
+
+    # Calculate NSE (mean results)
+    val = mu.metrics.outlet_nse_flow(synthetic_G = G_,
+                                    synthetic_results = results_,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == 0.0
+
+    # Test time interpolation
+    results_.loc[2,'date'] = pd.to_datetime('2021-01-01 00:00:10')
+    results_.loc[[0,2], 'value'] = [0,30]
+    val = mu.metrics.outlet_nse_flow(synthetic_G = G_,
+                                    synthetic_results = results_,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == -15.0
+
+def test_outlet_nse_flooding():
+    """Test the outlet_nse_flow metric."""
+    # Load data
+    G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
+    subs = get_subs()
+
+    # Mock results
+    results = pd.DataFrame([{'object' : 4253560,
+                             'variable' : 'flow',
+                             'value' : 10,
+                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                             {'object' : 4253560,
+                             'variable' : 'flow',
+                             'value' : 5,
+                             'date' : pd.to_datetime('2021-01-01 00:00:05')},
+                             {'object' : 25472468,
+                             'variable' : 'flooding',
+                             'value' : 4.5,
+                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                            {'object' : 770549936,
+                             'variable' : 'flooding',
+                             'value' : 5,
+                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                            {'object' : 109753,
+                             'variable' : 'flooding',
+                             'value' : 10,
+                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                             {'object' : 25472468,
+                             'variable' : 'flooding',
+                             'value' : 0,
+                             'date' : pd.to_datetime('2021-01-01 00:00:05')},
+                            {'object' : 770549936,
+                             'variable' : 'flooding',
+                             'value' : 5,
+                             'date' : pd.to_datetime('2021-01-01 00:00:05')},
+                            {'object' : 109753,
+                             'variable' : 'flooding',
+                             'value' : 15,
+                             'date' : pd.to_datetime('2021-01-01 00:00:05')}])
+    
+    # Calculate NSE (perfect results)
+    val = mu.metrics.outlet_nse_flooding(synthetic_G = G,
+                                    synthetic_results = results,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == 1.0
+
+    # Calculate NSE (mean results)
+    results_ = results.copy()
+    results_.loc[results_.object.isin([770549936, 25472468]),'value'] = [14.5 / 4] * 4
+    val = mu.metrics.outlet_nse_flooding(synthetic_G = G,
+                                    synthetic_results = results_,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == 0.0
+
+    # Change the graph
+    G_ = G.copy()
+    new_outlet = list(G_.in_edges(12354833))[0][0]
+    nx.set_node_attributes(G_,
+                           {x : new_outlet for x,d in G_.nodes(data = True) 
+                            if d['outlet'] == 12354833},
+                           'outlet')
+    G_.remove_node(12354833)
+
+    # Calculate NSE (mean results)
+    val = mu.metrics.outlet_nse_flooding(synthetic_G = G_,
+                                    synthetic_results = results_,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == 0.0
+
+    # Test time interpolation
+    results_.loc[results_.date == pd.to_datetime('2021-01-01 00:00:05'),
+                 'date'] = pd.to_datetime('2021-01-01 00:00:10')
+    
+    val = mu.metrics.outlet_nse_flooding(synthetic_G = G_,
+                                    synthetic_results = results_,
+                                    real_G = G,
+                                    real_results = results,
+                                    real_subs = subs)
+    assert val == 0.0
\ No newline at end of file

From d8bcd0da9aafe25d7895dde3d985191507db741e Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 11:45:50 +0000
Subject: [PATCH 03/21] Update metric_utilities.py

update to merge with `metric_format`
---
 swmmanywhere/metric_utilities.py | 244 ++++++++++++++-----------------
 1 file changed, 113 insertions(+), 131 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 8c2da18d..d57f0823 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -3,7 +3,7 @@
 
 @author: Barnaby Dobson
 """
-from abc import ABC, abstractmethod
+from inspect import signature
 from typing import Callable
 
 import geopandas as gpd
@@ -13,25 +13,33 @@
 from scipy import stats
 
 
-class BaseMetric(ABC):
-    """Base metric class."""
-    @abstractmethod
-    def __call__(self, 
-                 *args,
-                 **kwargs) -> float:
-        """Run the evaluated metric."""
-        return 0
-
 class MetricRegistry(dict): 
     """Registry object.""" 
     
-    def register(self, cls):
+    def register(self, func: Callable) -> Callable:
         """Register a metric."""
-        if cls.__name__ in self:
-            raise ValueError(f"{cls.__name__} already in the metric registry!")
-
-        self[cls.__name__] = cls()
-        return cls
+        if func.__name__ in self:
+            raise ValueError(f"{func.__name__} already in the metric registry!")
+
+        allowable_params = {"synthetic_results": pd.DataFrame,
+                            "real_results": pd.DataFrame,
+                            "synthetic_subs": gpd.GeoDataFrame,
+                            "real_subs": gpd.GeoDataFrame,
+                            "synthetic_G": nx.Graph,
+                            "real_G": nx.Graph}
+
+        sig = signature(func)
+        for param, obj in sig.parameters.items():
+            if param == 'kwargs':
+                continue
+            if param not in allowable_params:
+                raise ValueError(f"{param} of {func.__name__} not allowed.")
+            if obj.annotation != allowable_params[param]:
+                raise ValueError(f"""{param} of {func.__name__} should be of
+                                 type {allowable_params[param]}, not 
+                                 {obj.__class__}.""")
+        self[func.__name__] = func
+        return func
 
     def __getattr__(self, name):
         """Get a metric from the graphfcn dict."""
@@ -43,18 +51,6 @@ def __getattr__(self, name):
 
 metrics = MetricRegistry()
 
-def register_metric(cls) -> Callable:
-    """Register a metric.
-
-    Args:
-        cls (Callable): A class that inherits from BaseMetric
-
-    Returns:
-        cls (Callable): The same class
-    """
-    metrics.register(cls)
-    return cls
-
 def extract_var(df: pd.DataFrame,
                      var: str) -> pd.DataFrame:
     """Extract var from a dataframe."""
@@ -63,15 +59,12 @@ def extract_var(df: pd.DataFrame,
                         df_.date.min()).dt.total_seconds()
     return df_
 
-@register_metric
-class bias_flood_depth(BaseMetric):
-    """Bias flood depth."""
-    def __call__(self, 
+@metrics.register
+def bias_flood_depth(
                  synthetic_results: pd.DataFrame,
                  real_results: pd.DataFrame,
                  synthetic_subs: gpd.GeoDataFrame,
                  real_subs: gpd.GeoDataFrame,
-                 *args,
                  **kwargs) -> float:
         """Run the evaluated metric."""
         
@@ -90,32 +83,34 @@ def _f(x):
 
         return (syn_tot - real_tot) / real_tot
 
-@register_metric
-class kstest_betweenness(BaseMetric):
+@metrics.register
+def kstest_betweenness(
+                synthetic_G: nx.Graph,
+                real_G: nx.Graph,
+                real_subs: gpd.GeoDataFrame,
+                real_results: pd.DataFrame,
+                **kwargs) -> float:
     """KS two sided of betweenness distribution."""
-    def __call__(self, 
-                 synthetic_G: nx.Graph,
-                 real_G: nx.Graph,
-                 real_subs: gpd.GeoDataFrame,
-                 real_results: pd.DataFrame,
-                 *args,
-                 **kwargs) -> float:
-        """Run the evaluated metric."""
-        # Identify synthetic outlet and subgraph
-        sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
-        
-        # Identify real outlet and subgraph
-        sg_real, _ = dominant_outlet(real_G, real_results)
+    # Identify synthetic outlet and subgraph
+    sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
+    
+    # Identify real outlet and subgraph
+    sg_real, _ = dominant_outlet(real_G, real_results)
 
-        syn_betweenness = nx.betweenness_centrality(sg_syn)
-        real_betweenness = nx.betweenness_centrality(sg_real)
+    syn_betweenness = nx.betweenness_centrality(sg_syn)
+    real_betweenness = nx.betweenness_centrality(sg_real)
 
-        #TODO does it make more sense to use statistic or pvalue?
-        return stats.ks_2samp(list(syn_betweenness.values()),
-                              list(real_betweenness.values())).statistic
+    #TODO does it make more sense to use statistic or pvalue?
+    return stats.ks_2samp(list(syn_betweenness.values()),
+                            list(real_betweenness.values())).statistic
     
-@register_metric
-class outlet_nse_flow(BaseMetric):
+@metrics.register
+def outlet_nse_flow(synthetic_G: nx.Graph,
+                  synthetic_results: pd.DataFrame,
+                  real_G: nx.Graph,
+                  real_results: pd.DataFrame,
+                  real_subs: gpd.GeoDataFrame,
+                  **kwargs) -> float:
     """Outlet NSE flow.
 
     Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow
@@ -124,49 +119,45 @@ class outlet_nse_flow(BaseMetric):
     dominant_outlet, while the dominant outlet node of the 'synthetic' network
     is calculated by best_outlet_match.
     """
-    def __call__(self,
-                  synthetic_G: nx.Graph,
+    # Identify synthetic and real arcs that flow into the best outlet node
+    _, syn_outlet = best_outlet_match(synthetic_G, real_subs)
+    syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True)
+                if v == syn_outlet]
+    _, real_outlet = dominant_outlet(real_G, real_results)
+    real_arc = [d['id'] for u,v,d in real_G.edges(data=True)
+                if v == real_outlet]
+    
+    # Extract flow data
+    syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') &
+                                    (synthetic_results.object.isin(syn_arc))]
+    syn_flow = syn_flow.groupby('date').value.sum().reset_index()
+
+    real_flow = real_results.loc[(real_results.variable == 'flow') &
+                                    (real_results.object.isin(real_arc))]
+    real_flow = real_flow.groupby('date').value.sum().reset_index()
+
+    # Align data
+    df = pd.merge(syn_flow, 
+                    real_flow, 
+                    on = 'date', 
+                    suffixes = ('_syn','_real'),
+                    how = 'outer')
+    df = df.sort_values(by='date')
+
+    # Interpolate to time in real data
+    df['value_syn'] = df.set_index('date').value_syn.interpolate().values
+    df = df.dropna(subset = 'value_real')
+
+    # Calculate NSE
+    return nse(df.value_real, df.value_syn)
+
+@metrics.register
+def outlet_nse_flooding(synthetic_G: nx.Graph,
                   synthetic_results: pd.DataFrame,
                   real_G: nx.Graph,
                   real_results: pd.DataFrame,
                   real_subs: gpd.GeoDataFrame,
-                  *args,
                   **kwargs) -> float:
-        """Run the evaluated metric."""
-        # Identify synthetic and real arcs that flow into the best outlet node
-        _, syn_outlet = best_outlet_match(synthetic_G, real_subs)
-        syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True)
-                   if v == syn_outlet]
-        _, real_outlet = dominant_outlet(real_G, real_results)
-        real_arc = [d['id'] for u,v,d in real_G.edges(data=True)
-                    if v == real_outlet]
-        
-        # Extract flow data
-        syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') &
-                                        (synthetic_results.object.isin(syn_arc))]
-        syn_flow = syn_flow.groupby('date').value.sum().reset_index()
-
-        real_flow = real_results.loc[(real_results.variable == 'flow') &
-                                        (real_results.object.isin(real_arc))]
-        real_flow = real_flow.groupby('date').value.sum().reset_index()
-
-        # Align data
-        df = pd.merge(syn_flow, 
-                      real_flow, 
-                      on = 'date', 
-                      suffixes = ('_syn','_real'),
-                      how = 'outer')
-        df = df.sort_values(by='date')
-
-        # Interpolate to time in real data
-        df['value_syn'] = df.set_index('date').value_syn.interpolate().values
-        df = df.dropna(subset = 'value_real')
-
-        # Calculate NSE
-        return nse(df.value_real, df.value_syn)
-
-@register_metric
-class outlet_nse_flooding(BaseMetric):
     """Outlet NSE flooding.
     
     Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where
@@ -175,44 +166,35 @@ class outlet_nse_flooding(BaseMetric):
     network is calculated by dominant_outlet, while the dominant outlet node of
     the 'synthetic' network is calculated by best_outlet_match.
     """
-    def __call__(self,
-                  synthetic_G: nx.Graph,
-                  synthetic_results: pd.DataFrame,
-                  real_G: nx.Graph,
-                  real_results: pd.DataFrame,
-                  real_subs: gpd.GeoDataFrame,
-                  *args,
-                  **kwargs) -> float:
-        """Run the evaluated metric."""
-        # Identify synthetic and real outlet arcs
-        sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
-        sg_real, _ = dominant_outlet(real_G, real_results)
-        
-        # Extract flooding data
-        syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') &
-                                        synthetic_results.object.isin(sg_syn.nodes)]
-        
-        real_flood = real_results.loc[(real_results.variable == 'flooding') &
-                                        real_results.object.isin(sg_real.nodes)]
-        
-        # Aggregate to total flooding over network
-        syn_flood = syn_flood.groupby('date').value.sum().reset_index()
-        real_flood = real_flood.groupby('date').value.sum().reset_index()
-
-        # Align data
-        df = pd.merge(syn_flood, 
-                      real_flood, 
-                      on = 'date', 
-                      suffixes = ('_syn','_real'),
-                      how = 'outer')
-        df = df.sort_values(by='date')
-
-        # Interpolate to time in real data
-        df['value_syn'] = df.set_index('date').value_syn.interpolate().values
-        df = df.dropna(subset='value_real')
-
-        # Calculate NSE
-        return nse(df.value_real, df.value_syn)
+    # Identify synthetic and real outlet arcs
+    sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
+    sg_real, _ = dominant_outlet(real_G, real_results)
+    
+    # Extract flooding data
+    syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') &
+                                    synthetic_results.object.isin(sg_syn.nodes)]
+    
+    real_flood = real_results.loc[(real_results.variable == 'flooding') &
+                                    real_results.object.isin(sg_real.nodes)]
+    
+    # Aggregate to total flooding over network
+    syn_flood = syn_flood.groupby('date').value.sum().reset_index()
+    real_flood = real_flood.groupby('date').value.sum().reset_index()
+
+    # Align data
+    df = pd.merge(syn_flood, 
+                    real_flood, 
+                    on = 'date', 
+                    suffixes = ('_syn','_real'),
+                    how = 'outer')
+    df = df.sort_values(by='date')
+
+    # Interpolate to time in real data
+    df['value_syn'] = df.set_index('date').value_syn.interpolate().values
+    df = df.dropna(subset='value_real')
+
+    # Calculate NSE
+    return nse(df.value_real, df.value_syn)
 
 def nse(y: np.ndarray,
         yhat: np.ndarray) -> float:

From 1cbd9b8ec836229bc4a8427b215831d1d32eca38 Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 12:05:55 +0000
Subject: [PATCH 04/21] Update metric_utilities.py

revert kstest_betweenness
---
 swmmanywhere/metric_utilities.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index d57f0823..056fa029 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -84,22 +84,14 @@ def _f(x):
         return (syn_tot - real_tot) / real_tot
 
 @metrics.register
-def kstest_betweenness(
-                synthetic_G: nx.Graph,
-                real_G: nx.Graph,
-                real_subs: gpd.GeoDataFrame,
-                real_results: pd.DataFrame,
-                **kwargs) -> float:
+def kstest_betweenness( 
+                 synthetic_G: nx.Graph,
+                 real_G: nx.Graph,
+                 **kwargs) -> float:
     """KS two sided of betweenness distribution."""
-    # Identify synthetic outlet and subgraph
-    sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
+    syn_betweenness = nx.betweenness_centrality(synthetic_G)
+    real_betweenness = nx.betweenness_centrality(real_G)
     
-    # Identify real outlet and subgraph
-    sg_real, _ = dominant_outlet(real_G, real_results)
-
-    syn_betweenness = nx.betweenness_centrality(sg_syn)
-    real_betweenness = nx.betweenness_centrality(sg_real)
-
     #TODO does it make more sense to use statistic or pvalue?
     return stats.ks_2samp(list(syn_betweenness.values()),
                             list(real_betweenness.values())).statistic

From 14b2ce4ffb1f3ab21365ff92a7b6fdce1fc05f51 Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 12:08:04 +0000
Subject: [PATCH 05/21] Update metric_utilities.py

try and make the diff tidier
---
 swmmanywhere/metric_utilities.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 056fa029..2389cb55 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -88,13 +88,13 @@ def kstest_betweenness(
                  synthetic_G: nx.Graph,
                  real_G: nx.Graph,
                  **kwargs) -> float:
-    """KS two sided of betweenness distribution."""
-    syn_betweenness = nx.betweenness_centrality(synthetic_G)
-    real_betweenness = nx.betweenness_centrality(real_G)
-    
-    #TODO does it make more sense to use statistic or pvalue?
-    return stats.ks_2samp(list(syn_betweenness.values()),
-                            list(real_betweenness.values())).statistic
+        """KS two sided of betweenness distribution."""
+        syn_betweenness = nx.betweenness_centrality(synthetic_G)
+        real_betweenness = nx.betweenness_centrality(real_G)
+        
+        #TODO does it make more sense to use statistic or pvalue?
+        return stats.ks_2samp(list(syn_betweenness.values()),
+                                list(real_betweenness.values())).statistic
     
 @metrics.register
 def outlet_nse_flow(synthetic_G: nx.Graph,

From e16dac32e5b2c2853f5578f8e63f92547f64845a Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 13:03:44 +0000
Subject: [PATCH 06/21] Update test_metric_utilities.py

Revert kstest test
---
 tests/test_metric_utilities.py | 40 ++++++++++------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index 17ac11ec..8e14d695 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -77,44 +77,28 @@ def test_bias_flood_depth():
                               real_subs = real_subs)
     assert np.isclose(val, -0.29523809523809524)
 
+def test_kstest_betweenness():
+    """Test the kstest_betweenness metric."""
+    G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
+    val = mu.metrics.kstest_betweenness(synthetic_G = G, real_G = G)
+    assert val == 0.0
+
+    G_ = G.copy()
+    G_.remove_node(list(G.nodes)[0])
+    val = mu.metrics.kstest_betweenness(synthetic_G = G_, real_G = G)
+    assert np.isclose(val, 0.286231884057971)
+
 def test_best_outlet_match():
     """Test the best_outlet_match and ks_betweenness."""
     G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
     subs = get_subs()
     
-    sg, outlet = mu.best_outlet_match(synthetic_G = G, 
+    sg, outlet = mu.metrics.best_outlet_match(synthetic_G = G, 
                                      real_subs = subs)
     outlets = nx.get_node_attributes(sg, 'outlet')
     assert len(set(outlets.values())) == 1
     assert outlet == 12354833
 
-    results = pd.DataFrame([{'object' : 4253560,
-                             'variable' : 'flow',
-                             'value' : 10},
-                            {'object' : '',
-                             'variable' : 'flow',
-                             'value' : 5}])
-
-    val = mu.metrics.kstest_betweenness(synthetic_G = G, 
-                                 real_G = G, 
-                                 real_subs = subs,
-                                 real_results = results)
-    
-    assert val == 0.0
-
-    # Move the outlet up one node
-    G_ = G.copy()
-    nx.set_node_attributes(G_,
-                           list(G_.in_edges(outlet))[0][0],
-                           'outlet')
-    G_.remove_node(outlet)
-
-    val = mu.metrics.kstest_betweenness(synthetic_G = G_,
-                                real_G = G,
-                                real_subs = subs,
-                                real_results = results)
-    assert val == 0.4
-
 def test_nse():
     """Test the nse metric."""
     val = mu.nse(y = np.array([1,2,3,4,5]),

From 6dd0e65efaa9efc4ce91e809e21653daf715d7ca Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 13:05:33 +0000
Subject: [PATCH 07/21] Update test_metric_utilities.py

typo
---
 tests/test_metric_utilities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index 8e14d695..fa8be33a 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -93,7 +93,7 @@ def test_best_outlet_match():
     G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
     subs = get_subs()
     
-    sg, outlet = mu.metrics.best_outlet_match(synthetic_G = G, 
+    sg, outlet = mu.best_outlet_match(synthetic_G = G, 
                                      real_subs = subs)
     outlets = nx.get_node_attributes(sg, 'outlet')
     assert len(set(outlets.values())) == 1

From d1eda73783cf9e6e99f511df7639b4d004dae19c Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 13:32:35 +0000
Subject: [PATCH 08/21] Update metric_utilities.py

refactor
---
 swmmanywhere/metric_utilities.py | 109 +++++++++++++++----------------
 1 file changed, 54 insertions(+), 55 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 2389cb55..46e3a095 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -95,7 +95,41 @@ def kstest_betweenness(
         #TODO does it make more sense to use statistic or pvalue?
         return stats.ks_2samp(list(syn_betweenness.values()),
                                 list(real_betweenness.values())).statistic
-    
+
+def align_calc_nse(synthetic_results: pd.DataFrame, 
+                  real_results: pd.DataFrame, 
+                  variable: str, 
+                  syn_ids: list,
+                  real_ids: list) -> float:
+    """Align and calculate NSE.
+
+    Align the synthetic and real data and calculate the Nash-Sutcliffe
+    efficiency (NSE) of the variable over time. In cases where the synthetic
+    data is does not overlap the real data, the value is interpolated.
+    """
+    # Extract data
+    syn_data = extract_var(synthetic_results, variable)
+    syn_data = syn_data.loc[syn_data.object.isin(syn_ids)]
+    syn_data = syn_data.groupby('date').value.sum().reset_index()
+
+    real_data = extract_var(real_results, variable)
+    real_data = real_data.loc[real_data.object.isin(real_ids)]
+    real_data = real_data.groupby('date').value.sum().reset_index()
+
+    # Align data
+    df = pd.merge(syn_data, 
+                  real_data, 
+                  on='date', 
+                  suffixes=('_syn', '_real'), 
+                  how='outer').sort_values(by='date')
+
+    # Interpolate to time in real data
+    df['value_syn'] = df.set_index('date').value_syn.interpolate().values
+    df = df.dropna(subset=['value_real'])
+
+    # Calculate NSE
+    return nse(df.value_real, df.value_syn)
+
 @metrics.register
 def outlet_nse_flow(synthetic_G: nx.Graph,
                   synthetic_results: pd.DataFrame,
@@ -119,29 +153,12 @@ def outlet_nse_flow(synthetic_G: nx.Graph,
     real_arc = [d['id'] for u,v,d in real_G.edges(data=True)
                 if v == real_outlet]
     
-    # Extract flow data
-    syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') &
-                                    (synthetic_results.object.isin(syn_arc))]
-    syn_flow = syn_flow.groupby('date').value.sum().reset_index()
-
-    real_flow = real_results.loc[(real_results.variable == 'flow') &
-                                    (real_results.object.isin(real_arc))]
-    real_flow = real_flow.groupby('date').value.sum().reset_index()
-
-    # Align data
-    df = pd.merge(syn_flow, 
-                    real_flow, 
-                    on = 'date', 
-                    suffixes = ('_syn','_real'),
-                    how = 'outer')
-    df = df.sort_values(by='date')
-
-    # Interpolate to time in real data
-    df['value_syn'] = df.set_index('date').value_syn.interpolate().values
-    df = df.dropna(subset = 'value_real')
+    return align_calc_nse(synthetic_results, 
+                         real_results, 
+                         'flow', 
+                         syn_arc, 
+                         real_arc)
 
-    # Calculate NSE
-    return nse(df.value_real, df.value_syn)
 
 @metrics.register
 def outlet_nse_flooding(synthetic_G: nx.Graph,
@@ -162,31 +179,11 @@ def outlet_nse_flooding(synthetic_G: nx.Graph,
     sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
     sg_real, _ = dominant_outlet(real_G, real_results)
     
-    # Extract flooding data
-    syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') &
-                                    synthetic_results.object.isin(sg_syn.nodes)]
-    
-    real_flood = real_results.loc[(real_results.variable == 'flooding') &
-                                    real_results.object.isin(sg_real.nodes)]
-    
-    # Aggregate to total flooding over network
-    syn_flood = syn_flood.groupby('date').value.sum().reset_index()
-    real_flood = real_flood.groupby('date').value.sum().reset_index()
-
-    # Align data
-    df = pd.merge(syn_flood, 
-                    real_flood, 
-                    on = 'date', 
-                    suffixes = ('_syn','_real'),
-                    how = 'outer')
-    df = df.sort_values(by='date')
-
-    # Interpolate to time in real data
-    df['value_syn'] = df.set_index('date').value_syn.interpolate().values
-    df = df.dropna(subset='value_real')
-
-    # Calculate NSE
-    return nse(df.value_real, df.value_syn)
+    return align_calc_nse(synthetic_results, 
+                         real_results, 
+                         'flooding', 
+                         list(sg_syn.nodes),
+                         list(sg_real.nodes))
 
 def nse(y: np.ndarray,
         yhat: np.ndarray) -> float:
@@ -212,13 +209,15 @@ def best_outlet_match(synthetic_G: nx.Graph,
     # Identify which nodes fall within real_subs
     nodes_df = pd.DataFrame([d for x,d in synthetic_G.nodes(data=True)],
                              index = synthetic_G.nodes)
-    nodes_gdf = gpd.GeoDataFrame(nodes_df, 
-                                 geometry=gpd.points_from_xy(nodes_df.x, 
-                                                             nodes_df.y),
-                                 crs = synthetic_G.graph['crs'])
-    nodes_joined = nodes_gdf.sjoin(real_subs, 
-                                   how="right", 
-                                   predicate="intersects")
+    nodes_joined = (
+        gpd.GeoDataFrame(nodes_df, 
+                         geometry=gpd.points_from_xy(nodes_df.x, 
+                                                    nodes_df.y),
+                         crs = synthetic_G.graph['crs'])
+        .sjoin(real_subs, 
+               how="right", 
+               predicate="intersects")
+    )
 
     # Select the most common outlet
     outlet = nodes_joined.outlet.value_counts().idxmax()

From d9ba7110f97ebdcec980876548fa4ce4f7c895e3 Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 13:34:48 +0000
Subject: [PATCH 09/21] Update metric_utilities.py

move utiltiies up
---
 swmmanywhere/metric_utilities.py | 186 +++++++++++++++----------------
 1 file changed, 93 insertions(+), 93 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 46e3a095..763d074d 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -59,43 +59,6 @@ def extract_var(df: pd.DataFrame,
                         df_.date.min()).dt.total_seconds()
     return df_
 
-@metrics.register
-def bias_flood_depth(
-                 synthetic_results: pd.DataFrame,
-                 real_results: pd.DataFrame,
-                 synthetic_subs: gpd.GeoDataFrame,
-                 real_subs: gpd.GeoDataFrame,
-                 **kwargs) -> float:
-        """Run the evaluated metric."""
-        
-        def _f(x):
-            return np.trapz(x.value,x.duration)
-
-        syn_flooding = extract_var(synthetic_results,
-                                    'flooding').groupby('object').apply(_f)
-        syn_area = synthetic_subs.impervious_area.sum()
-        syn_tot = syn_flooding.sum() / syn_area
-
-        real_flooding = extract_var(real_results,
-                                    'flooding').groupby('object').apply(_f)
-        real_area = real_subs.impervious_area.sum()
-        real_tot = real_flooding.sum() / real_area
-
-        return (syn_tot - real_tot) / real_tot
-
-@metrics.register
-def kstest_betweenness( 
-                 synthetic_G: nx.Graph,
-                 real_G: nx.Graph,
-                 **kwargs) -> float:
-        """KS two sided of betweenness distribution."""
-        syn_betweenness = nx.betweenness_centrality(synthetic_G)
-        real_betweenness = nx.betweenness_centrality(real_G)
-        
-        #TODO does it make more sense to use statistic or pvalue?
-        return stats.ks_2samp(list(syn_betweenness.values()),
-                                list(real_betweenness.values())).statistic
-
 def align_calc_nse(synthetic_results: pd.DataFrame, 
                   real_results: pd.DataFrame, 
                   variable: str, 
@@ -130,61 +93,6 @@ def align_calc_nse(synthetic_results: pd.DataFrame,
     # Calculate NSE
     return nse(df.value_real, df.value_syn)
 
-@metrics.register
-def outlet_nse_flow(synthetic_G: nx.Graph,
-                  synthetic_results: pd.DataFrame,
-                  real_G: nx.Graph,
-                  real_results: pd.DataFrame,
-                  real_subs: gpd.GeoDataFrame,
-                  **kwargs) -> float:
-    """Outlet NSE flow.
-
-    Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow
-    is measured as the total flow of all arcs that drain to the 'dominant'
-    outlet node. The dominant outlet node of the 'real' network is calculated by
-    dominant_outlet, while the dominant outlet node of the 'synthetic' network
-    is calculated by best_outlet_match.
-    """
-    # Identify synthetic and real arcs that flow into the best outlet node
-    _, syn_outlet = best_outlet_match(synthetic_G, real_subs)
-    syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True)
-                if v == syn_outlet]
-    _, real_outlet = dominant_outlet(real_G, real_results)
-    real_arc = [d['id'] for u,v,d in real_G.edges(data=True)
-                if v == real_outlet]
-    
-    return align_calc_nse(synthetic_results, 
-                         real_results, 
-                         'flow', 
-                         syn_arc, 
-                         real_arc)
-
-
-@metrics.register
-def outlet_nse_flooding(synthetic_G: nx.Graph,
-                  synthetic_results: pd.DataFrame,
-                  real_G: nx.Graph,
-                  real_results: pd.DataFrame,
-                  real_subs: gpd.GeoDataFrame,
-                  **kwargs) -> float:
-    """Outlet NSE flooding.
-    
-    Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where
-    flooding is the total volume of flooded water across all nodes that drain
-    to the 'dominant' outlet node. The dominant outlet node of the 'real' 
-    network is calculated by dominant_outlet, while the dominant outlet node of
-    the 'synthetic' network is calculated by best_outlet_match.
-    """
-    # Identify synthetic and real outlet arcs
-    sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
-    sg_real, _ = dominant_outlet(real_G, real_results)
-    
-    return align_calc_nse(synthetic_results, 
-                         real_results, 
-                         'flooding', 
-                         list(sg_syn.nodes),
-                         list(sg_real.nodes))
-
 def nse(y: np.ndarray,
         yhat: np.ndarray) -> float:
     """Calculate Nash-Sutcliffe efficiency (NSE)."""
@@ -258,4 +166,96 @@ def dominant_outlet(G: nx.Graph,
     
     # Subselect the matching graph
     sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet})
-    return sg, max_outlet
\ No newline at end of file
+    return sg, max_outlet
+
+@metrics.register
+def bias_flood_depth(
+                 synthetic_results: pd.DataFrame,
+                 real_results: pd.DataFrame,
+                 synthetic_subs: gpd.GeoDataFrame,
+                 real_subs: gpd.GeoDataFrame,
+                 **kwargs) -> float:
+        """Run the evaluated metric."""
+        
+        def _f(x):
+            return np.trapz(x.value,x.duration)
+
+        syn_flooding = extract_var(synthetic_results,
+                                    'flooding').groupby('object').apply(_f)
+        syn_area = synthetic_subs.impervious_area.sum()
+        syn_tot = syn_flooding.sum() / syn_area
+
+        real_flooding = extract_var(real_results,
+                                    'flooding').groupby('object').apply(_f)
+        real_area = real_subs.impervious_area.sum()
+        real_tot = real_flooding.sum() / real_area
+
+        return (syn_tot - real_tot) / real_tot
+
+@metrics.register
+def kstest_betweenness( 
+                 synthetic_G: nx.Graph,
+                 real_G: nx.Graph,
+                 **kwargs) -> float:
+        """KS two sided of betweenness distribution."""
+        syn_betweenness = nx.betweenness_centrality(synthetic_G)
+        real_betweenness = nx.betweenness_centrality(real_G)
+        
+        #TODO does it make more sense to use statistic or pvalue?
+        return stats.ks_2samp(list(syn_betweenness.values()),
+                                list(real_betweenness.values())).statistic
+
+@metrics.register
+def outlet_nse_flow(synthetic_G: nx.Graph,
+                  synthetic_results: pd.DataFrame,
+                  real_G: nx.Graph,
+                  real_results: pd.DataFrame,
+                  real_subs: gpd.GeoDataFrame,
+                  **kwargs) -> float:
+    """Outlet NSE flow.
+
+    Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow
+    is measured as the total flow of all arcs that drain to the 'dominant'
+    outlet node. The dominant outlet node of the 'real' network is calculated by
+    dominant_outlet, while the dominant outlet node of the 'synthetic' network
+    is calculated by best_outlet_match.
+    """
+    # Identify synthetic and real arcs that flow into the best outlet node
+    _, syn_outlet = best_outlet_match(synthetic_G, real_subs)
+    syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True)
+                if v == syn_outlet]
+    _, real_outlet = dominant_outlet(real_G, real_results)
+    real_arc = [d['id'] for u,v,d in real_G.edges(data=True)
+                if v == real_outlet]
+    
+    return align_calc_nse(synthetic_results, 
+                         real_results, 
+                         'flow', 
+                         syn_arc, 
+                         real_arc)
+
+
+@metrics.register
+def outlet_nse_flooding(synthetic_G: nx.Graph,
+                  synthetic_results: pd.DataFrame,
+                  real_G: nx.Graph,
+                  real_results: pd.DataFrame,
+                  real_subs: gpd.GeoDataFrame,
+                  **kwargs) -> float:
+    """Outlet NSE flooding.
+    
+    Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where
+    flooding is the total volume of flooded water across all nodes that drain
+    to the 'dominant' outlet node. The dominant outlet node of the 'real' 
+    network is calculated by dominant_outlet, while the dominant outlet node of
+    the 'synthetic' network is calculated by best_outlet_match.
+    """
+    # Identify synthetic and real outlet arcs
+    sg_syn, _ = best_outlet_match(synthetic_G, real_subs)
+    sg_real, _ = dominant_outlet(real_G, real_results)
+    
+    return align_calc_nse(synthetic_results, 
+                         real_results, 
+                         'flooding', 
+                         list(sg_syn.nodes),
+                         list(sg_real.nodes))
\ No newline at end of file

From 03fb75bc0c2c3604eaa460c2f2dd2887159ac227 Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Wed, 6 Mar 2024 13:36:24 +0000
Subject: [PATCH 10/21] Update metric_utilities.py

keep ks betweenness the same
---
 swmmanywhere/metric_utilities.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 763d074d..2e943989 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -197,13 +197,13 @@ def kstest_betweenness(
                  synthetic_G: nx.Graph,
                  real_G: nx.Graph,
                  **kwargs) -> float:
-        """KS two sided of betweenness distribution."""
+        """Run the evaluated metric."""
         syn_betweenness = nx.betweenness_centrality(synthetic_G)
         real_betweenness = nx.betweenness_centrality(real_G)
-        
+
         #TODO does it make more sense to use statistic or pvalue?
         return stats.ks_2samp(list(syn_betweenness.values()),
-                                list(real_betweenness.values())).statistic
+                              list(real_betweenness.values())).statistic
 
 @metrics.register
 def outlet_nse_flow(synthetic_G: nx.Graph,

From 2a5f9a008384b2bc94a4c0c934b342a52c27903f Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Thu, 7 Mar 2024 08:55:16 +0000
Subject: [PATCH 11/21] Update test_metric_utilities.py

---
 tests/test_metric_utilities.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index fa8be33a..a09ae1ed 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -10,6 +10,10 @@
 from swmmanywhere.graph_utilities import load_graph
 
 
+def assert_close(a: float, b: float, rtol: float = 1e-3) -> None:
+    """Assert that two floats are close."""
+    assert np.isclose(a, b, rtol=rtol).all()
+
 def get_subs():
     """Get a GeoDataFrame of subcatchments."""
     subs = [shapely.Polygon([(700262, 5709928),
@@ -75,7 +79,7 @@ def test_bias_flood_depth():
                               real_results = real_results,
                               synthetic_subs = synthetic_subs,
                               real_subs = real_subs)
-    assert np.isclose(val, -0.29523809523809524)
+    assert_close(val, -0.2952)
 
 def test_kstest_betweenness():
     """Test the kstest_betweenness metric."""
@@ -86,7 +90,7 @@ def test_kstest_betweenness():
     G_ = G.copy()
     G_.remove_node(list(G.nodes)[0])
     val = mu.metrics.kstest_betweenness(synthetic_G = G_, real_G = G)
-    assert np.isclose(val, 0.286231884057971)
+    assert_close(val, 0.2862)
 
 def test_best_outlet_match():
     """Test the best_outlet_match and ks_betweenness."""

From a99950c9368d8f715420b8d9efba20d94cc7ef5b Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Thu, 7 Mar 2024 08:57:54 +0000
Subject: [PATCH 12/21] Update metric_utilities.py

---
 swmmanywhere/metric_utilities.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 2e943989..f1859347 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -135,8 +135,8 @@ def best_outlet_match(synthetic_G: nx.Graph,
                     if d['outlet'] == outlet]
     return synthetic_G.subgraph(outlet_nodes), outlet
 
-def dominant_outlet(G: nx.Graph,
-                    results: pd.DataFrame) -> tuple[nx.Graph,int]:
+def dominant_outlet(G: nx.DiGraph,
+                    results: pd.DataFrame) -> tuple[nx.DiGraph,int]:
     """Dominant outlet.
 
     Identify the outlet with highest flow along it and return the

From 8465bb916c6ebb8635d51498cdaee1619050c701 Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Thu, 7 Mar 2024 09:27:06 +0000
Subject: [PATCH 13/21] Update metric_utilities.py

proper subgraph
values->to_numpy
---
 swmmanywhere/metric_utilities.py | 36 +++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index f1859347..9318fd10 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -87,12 +87,41 @@ def align_calc_nse(synthetic_results: pd.DataFrame,
                   how='outer').sort_values(by='date')
 
     # Interpolate to time in real data
-    df['value_syn'] = df.set_index('date').value_syn.interpolate().values
+    df['value_syn'] = df.set_index('date').value_syn.interpolate().to_numpy()
     df = df.dropna(subset=['value_real'])
 
     # Calculate NSE
     return nse(df.value_real, df.value_syn)
 
+def create_subgraph(G: nx.Graph,
+                    nodes: list) -> nx.Graph:
+    """Create a subgraph.
+    
+    Create a subgraph of G based on the nodes list. Taken from networkx 
+    documentation: https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.subgraph.html
+    
+    Args:
+        G (nx.Graph): The original graph.
+        nodes (list): The list of nodes to include in the subgraph.
+
+    Returns:
+        nx.Graph: The subgraph.
+    """
+    # Create a subgraph SG based on a (possibly multigraph) G
+    SG = G.__class__()
+    SG.add_nodes_from((n, G.nodes[n]) for n in nodes)
+    if SG.is_multigraph():
+        SG.add_edges_from((n, nbr, key, d)
+            for n, nbrs in G.adj.items() if n in nodes
+            for nbr, keydict in nbrs.items() if nbr in nodes
+            for key, d in keydict.items())
+    else:
+        SG.add_edges_from((n, nbr, d)
+            for n, nbrs in G.adj.items() if n in nodes
+            for nbr, d in nbrs.items() if nbr in nodes)
+    SG.graph.update(G.graph)
+    return SG
+
 def nse(y: np.ndarray,
         yhat: np.ndarray) -> float:
     """Calculate Nash-Sutcliffe efficiency (NSE)."""
@@ -133,7 +162,8 @@ def best_outlet_match(synthetic_G: nx.Graph,
     # Subselect the matching graph
     outlet_nodes = [n for n, d in synthetic_G.nodes(data=True) 
                     if d['outlet'] == outlet]
-    return synthetic_G.subgraph(outlet_nodes), outlet
+    sg = create_subgraph(synthetic_G,outlet_nodes)
+    return sg, outlet
 
 def dominant_outlet(G: nx.DiGraph,
                     results: pd.DataFrame) -> tuple[nx.DiGraph,int]:
@@ -165,7 +195,7 @@ def dominant_outlet(G: nx.DiGraph,
                   if d['id'] == max_outlet_arc][0]
     
     # Subselect the matching graph
-    sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet})
+    sg = create_subgraph(G, nx.ancestors(G, max_outlet) | {max_outlet})
     return sg, max_outlet
 
 @metrics.register

From 3699dd6a195515cd10c74bd9af9940b93aefa182 Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Thu, 7 Mar 2024 09:30:48 +0000
Subject: [PATCH 14/21] Update metric_utilities.py

avoid intersects
---
 swmmanywhere/metric_utilities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 9318fd10..4c9d5cb0 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -153,7 +153,7 @@ def best_outlet_match(synthetic_G: nx.Graph,
                          crs = synthetic_G.graph['crs'])
         .sjoin(real_subs, 
                how="right", 
-               predicate="intersects")
+               predicate="within")
     )
 
     # Select the most common outlet

From f12a790132b0544313f7b6a25c48d9ba980cb867 Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Thu, 7 Mar 2024 10:15:35 +0000
Subject: [PATCH 15/21] Ensure align_calc_nse dates are consistent format

---
 swmmanywhere/metric_utilities.py | 17 +++++++++++------
 tests/test_metric_utilities.py   |  6 +++---
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 4c9d5cb0..98fb7926 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -70,24 +70,29 @@ def align_calc_nse(synthetic_results: pd.DataFrame,
     efficiency (NSE) of the variable over time. In cases where the synthetic
     data is does not overlap the real data, the value is interpolated.
     """
+    # Format dates
+    synthetic_results['date'] = pd.to_datetime(synthetic_results['date'])
+    real_results['date'] = pd.to_datetime(real_results['date'])
+
     # Extract data
     syn_data = extract_var(synthetic_results, variable)
     syn_data = syn_data.loc[syn_data.object.isin(syn_ids)]
-    syn_data = syn_data.groupby('date').value.sum().reset_index()
+    syn_data = syn_data.groupby('date').value.sum()
 
     real_data = extract_var(real_results, variable)
     real_data = real_data.loc[real_data.object.isin(real_ids)]
-    real_data = real_data.groupby('date').value.sum().reset_index()
-
+    real_data = real_data.groupby('date').value.sum()
+    
     # Align data
     df = pd.merge(syn_data, 
                   real_data, 
-                  on='date', 
+                  left_index = True,
+                  right_index = True,
                   suffixes=('_syn', '_real'), 
-                  how='outer').sort_values(by='date')
+                  how='outer').sort_index()
 
     # Interpolate to time in real data
-    df['value_syn'] = df.set_index('date').value_syn.interpolate().to_numpy()
+    df['value_syn'] = df.value_syn.interpolate().to_numpy()
     df = df.dropna(subset=['value_real'])
 
     # Calculate NSE
diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index a09ae1ed..5a72f5bb 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -123,11 +123,11 @@ def test_outlet_nse_flow():
     results = pd.DataFrame([{'object' : 4253560,
                              'variable' : 'flow',
                              'value' : 10,
-                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                             'date' : pd.to_datetime('2021-01-01').date()},
                             {'object' : '',
                              'variable' : 'flow',
                              'value' : 5,
-                             'date' : pd.to_datetime('2021-01-01 00:00:00')},
+                             'date' : pd.to_datetime('2021-01-01').date()},
                              {'object' : 4253560,
                              'variable' : 'flow',
                              'value' : 5,
@@ -136,7 +136,7 @@ def test_outlet_nse_flow():
                              'variable' : 'flow',
                              'value' : 2,
                              'date' : pd.to_datetime('2021-01-01 00:00:05')}])
-    
+
     # Calculate NSE (perfect results)
     val = mu.metrics.outlet_nse_flow(synthetic_G = G,
                                     synthetic_results = results,

From 927c74bd9ae81a8bc4e9da6193e1ebb8f7e556de Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Thu, 7 Mar 2024 11:40:44 +0000
Subject: [PATCH 16/21] Add netcomp metrics

---
 dev-requirements.txt             | 15 +++----
 pyproject.toml                   |  3 +-
 requirements.txt                 | 15 +++----
 swmmanywhere/metric_utilities.py | 72 ++++++++++++++++++++++++++++++++
 tests/test_metric_utilities.py   | 18 ++++++++
 5 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 066ce393..aac648e5 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -92,14 +92,9 @@ geojson==3.1.0
 geopandas==0.14.2
     # via
     #   osmnx
-    #   snkit
     #   swmmanywhere (pyproject.toml)
 geopy==2.4.1
     # via swmmanywhere (pyproject.toml)
-gitdb==4.0.11
-    # via gitpython
-gitpython==3.1.41
-    # via swmmanywhere (pyproject.toml)
 identify==2.5.33
     # via pre-commit
 idna==3.6
@@ -130,8 +125,11 @@ mypy-extensions==1.0.0
     # via mypy
 netcdf4==1.6.5
     # via swmmanywhere (pyproject.toml)
+netcomp @ git+https://github.com/barneydobson/NetComp.git
+    # via swmmanywhere (pyproject.toml)
 networkx==3.2.1
     # via
+    #   netcomp
     #   osmnx
     #   scikit-image
     #   swmmanywhere (pyproject.toml)
@@ -147,6 +145,7 @@ numpy==1.26.3
     #   imageio
     #   matplotlib
     #   netcdf4
+    #   netcomp
     #   numba
     #   osmnx
     #   pandas
@@ -259,6 +258,7 @@ scikit-image==0.22.0
     # via pysheds
 scipy==1.12.0
     # via
+    #   netcomp
     #   pysheds
     #   salib
     #   scikit-image
@@ -267,16 +267,11 @@ shapely==2.0.2
     # via
     #   geopandas
     #   osmnx
-    #   snkit
     #   swmmanywhere (pyproject.toml)
 six==1.16.0
     # via
     #   fiona
     #   python-dateutil
-smmap==5.0.1
-    # via gitdb
-snkit==1.9.0
-    # via swmmanywhere (pyproject.toml)
 snuggs==1.4.7
     # via rasterio
 swmm-toolkit==0.15.3
diff --git a/pyproject.toml b/pyproject.toml
index 673acf7d..57d1fb34 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,9 +29,9 @@ dependencies = [
   "fiona",
   "geopandas",
   "geopy",
-  "GitPython",
   "matplotlib",
   "netcdf4",
+  "netcomp@ git+https://github.com/barneydobson/NetComp.git",
   "networkx",
   "numpy",
   "osmnx",
@@ -46,7 +46,6 @@ dependencies = [
   "SALib",
   "SciPy",
   "shapely",
-  "snkit",
   "tqdm",
   "xarray",
 ]
diff --git a/requirements.txt b/requirements.txt
index cde240ba..69e76bca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -74,14 +74,9 @@ geojson==3.1.0
 geopandas==0.14.2
     # via
     #   osmnx
-    #   snkit
     #   swmmanywhere (pyproject.toml)
 geopy==2.4.1
     # via swmmanywhere (pyproject.toml)
-gitdb==4.0.11
-    # via gitpython
-gitpython==3.1.41
-    # via swmmanywhere (pyproject.toml)
 idna==3.6
     # via requests
 imageio==2.33.1
@@ -102,8 +97,11 @@ multiprocess==0.70.15
     # via salib
 netcdf4==1.6.5
     # via swmmanywhere (pyproject.toml)
+netcomp @ git+https://github.com/barneydobson/NetComp.git
+    # via swmmanywhere (pyproject.toml)
 networkx==3.2.1
     # via
+    #   netcomp
     #   osmnx
     #   scikit-image
     #   swmmanywhere (pyproject.toml)
@@ -117,6 +115,7 @@ numpy==1.26.3
     #   imageio
     #   matplotlib
     #   netcdf4
+    #   netcomp
     #   numba
     #   osmnx
     #   pandas
@@ -201,6 +200,7 @@ scikit-image==0.22.0
     # via pysheds
 scipy==1.12.0
     # via
+    #   netcomp
     #   pysheds
     #   salib
     #   scikit-image
@@ -209,16 +209,11 @@ shapely==2.0.2
     # via
     #   geopandas
     #   osmnx
-    #   snkit
     #   swmmanywhere (pyproject.toml)
 six==1.16.0
     # via
     #   fiona
     #   python-dateutil
-smmap==5.0.1
-    # via gitdb
-snkit==1.9.0
-    # via swmmanywhere (pyproject.toml)
 snuggs==1.4.7
     # via rasterio
 swmm-toolkit==0.15.3
diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 2e943989..d528cfbe 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -7,6 +7,7 @@
 from typing import Callable
 
 import geopandas as gpd
+import netcomp
 import networkx as nx
 import numpy as np
 import pandas as pd
@@ -168,6 +169,77 @@ def dominant_outlet(G: nx.Graph,
     sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet})
     return sg, max_outlet
 
+def nc_compare(G1, G2, funcname, **kw):
+    """Compare two graphs using netcomp."""
+    A1,A2 = [nx.adjacency_matrix(G) for G in (G1,G2)]
+    return getattr(netcomp, funcname)(A1,A2,**kw)
+
+@metrics.register
+def nc_deltacon0(synthetic_G: nx.Graph,
+                  real_G: nx.Graph,
+                  **kwargs) -> float:
+    """Run the evaluated metric."""
+    return nc_compare(synthetic_G, 
+                      real_G, 
+                      'deltacon0',
+                      eps = 1e-10)
+
+@metrics.register
+def nc_laplacian_dist(synthetic_G: nx.Graph,
+                  real_G: nx.Graph,
+                  **kwargs) -> float:
+    """Run the evaluated metric."""
+    return nc_compare(synthetic_G, 
+                      real_G, 
+                      'lambda_dist',
+                      k=10,
+                      kind = 'laplacian')
+
+@metrics.register
+def nc_laplacian_norm_dist(synthetic_G: nx.Graph,
+                  real_G: nx.Graph,
+                  **kwargs) -> float:
+    """Run the evaluated metric."""
+    return nc_compare(synthetic_G, 
+                      real_G, 
+                      'lambda_dist',
+                      k=10,
+                      kind = 'laplacian_norm')
+
+@metrics.register
+def nc_adjacency_dist(synthetic_G: nx.Graph,
+                  real_G: nx.Graph,
+                  **kwargs) -> float:
+    """Run the evaluated metric."""
+    return nc_compare(synthetic_G, 
+                      real_G, 
+                      'lambda_dist',
+                      k=10,
+                      kind = 'adjacency')
+
+@metrics.register
+def nc_vertex_edge_distance(synthetic_G: nx.Graph,
+                  real_G: nx.Graph,
+                  **kwargs) -> float:
+    """Run the evaluated metric.
+    
+    Do '1 -' because this metric is similarity not distance.
+    """
+    return 1 - nc_compare(synthetic_G, 
+                           real_G, 
+                          'vertex_edge_distance')
+
+@metrics.register
+def nc_resistance_distance(synthetic_G: nx.Graph,
+                  real_G: nx.Graph,
+                  **kwargs) -> float:
+    """Run the evaluated metric."""
+    return nc_compare(synthetic_G,
+                        real_G,
+                        'resistance_distance',
+                        check_connected = False,
+                        renormalized = True)
+
 @metrics.register
 def bias_flood_depth(
                  synthetic_results: pd.DataFrame,
diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index fa8be33a..c19219b5 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -263,3 +263,21 @@ def test_outlet_nse_flooding():
                                     real_results = results,
                                     real_subs = subs)
     assert val == 0.0
+
+def test_netcomp():
+    """Test the netcomp metrics."""
+    netcomp_results = {'nc_deltacon0' : 0.00129408,
+                       'nc_laplacian_dist' : 36.334773,
+                       'nc_laplacian_norm_dist' : 1.932007,
+                       'nc_adjacency_dist' : 3.542749,
+                       'nc_resistance_distance' : 8.098548,
+                       'nc_vertex_edge_distance' : 0.132075}
+
+    for func, val in netcomp_results.items():
+        G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
+        val_ = getattr(mu.metrics, func)(synthetic_G = G, real_G = G)
+        assert val_ == 0.0, func
+
+        G_ = load_graph(Path(__file__).parent / 'test_data' / 'street_graph.json')
+        val_ = getattr(mu.metrics, func)(synthetic_G = G_, real_G = G)
+        assert np.isclose(val, val_), func
\ No newline at end of file

From 1c7b9550b0d9646e1df25abea78019052d3b8558 Mon Sep 17 00:00:00 2001
From: Dobson <bdobson@ic.ac.uk>
Date: Thu, 7 Mar 2024 13:43:16 +0000
Subject: [PATCH 17/21] test new metrics

---
 swmmanywhere/misc/debug_topology.py | 60 +++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 swmmanywhere/misc/debug_topology.py

diff --git a/swmmanywhere/misc/debug_topology.py b/swmmanywhere/misc/debug_topology.py
new file mode 100644
index 00000000..86034b42
--- /dev/null
+++ b/swmmanywhere/misc/debug_topology.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+"""Created 2024-03-07.
+
+@author: Barnaby Dobson
+"""
+from pathlib import Path
+from time import time
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from swmmanywhere import metric_utilities as mu
+from swmmanywhere.prepare_data import download_street
+
+# Download streets
+Gs = download_street((-0.09547,51.52196,-0.09232,51.52422))
+Gm = download_street((-0.09547,51.52196,-0.08876,51.52661))
+Gl = download_street((-0.09547,51.52196,-0.08261,51.53097))
+Gsep = download_street((-0.08261,51.53097,-0.06946,51.53718))
+
+# Define topological metrics
+metrics = ['nc_deltacon0',
+            'nc_laplacian_dist' ,
+            'nc_laplacian_norm_dist' ,
+            'nc_adjacency_dist' ,
+            'nc_resistance_distance' ,
+            'nc_vertex_edge_distance',
+            'kstest_betweenness'
+            ]
+
+# Calculate metrics
+results = []
+for G1,l1  in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']):
+    for G2,l2 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']):
+        for func in metrics:
+            start = time()
+            val_ = getattr(mu.metrics, func)(synthetic_G = G1, real_G = G2)
+            end = time()
+            results.append({'func':func, 
+                            'val':val_, 
+                            'time':end - start, 
+                            'l1':l1, 
+                            'l2':l2})
+df = pd.DataFrame(results).sort_values(by=['l1','l2'])
+
+# Plot heatmap by func
+f,axs = plt.subplots(4,2,figsize=(10,10))
+for (func,grp),ax in zip(df.groupby('func'),axs.reshape(-1)):
+    grp = grp.pivot(index='l1',
+                columns= 'l2', 
+                values='val')
+    ax.imshow(grp,cmap='Reds')
+    ax.set_xticks(range(4))
+    ax.set_yticks(range(4))
+    ax.set_xticklabels(grp.columns,rotation=90)
+    ax.set_yticklabels(grp.index)
+    ax.set_title(func)
+plots = Path(__file__).parent
+f.tight_layout()
+f.savefig(plots / 'heatmap.png')
\ No newline at end of file

From 64d7b0397d65c3edac2401f35e0a4b92edbf45e5 Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Mon, 11 Mar 2024 14:13:35 +0000
Subject: [PATCH 18/21] BC changes

-Use taher's new function for edge betweenness
-introduce new metric for edge betweenness (in contrast to node betweenness)
-update requirements
---
 dev-requirements.txt                | 18 +++++-
 pyproject.toml                      |  2 +
 requirements.txt                    |  8 ++-
 swmmanywhere/metric_utilities.py    | 38 +++++++++++-
 swmmanywhere/misc/debug_topology.py | 89 +++++++++++++++--------------
 tests/test_metric_utilities.py      | 11 ++++
 6 files changed, 118 insertions(+), 48 deletions(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index aac648e5..21c25d87 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with Python 3.11
+# This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
 #    pip-compile --extra=dev --output-file=dev-requirements.txt
@@ -67,10 +67,14 @@ cramjam==2.7.0
     # via fastparquet
 cycler==0.12.1
     # via matplotlib
+cytoolz==0.12.3
+    # via swmmanywhere (pyproject.toml)
 dill==0.3.7
     # via multiprocess
 distlib==0.3.8
     # via virtualenv
+exceptiongroup==1.2.0
+    # via pytest
 fastparquet==2023.10.1
     # via swmmanywhere (pyproject.toml)
 filelock==3.13.1
@@ -103,6 +107,8 @@ imageio==2.33.1
     # via scikit-image
 iniconfig==2.0.0
     # via pytest
+joblib==1.3.2
+    # via swmmanywhere (pyproject.toml)
 julian==0.14
     # via pyswmm
 kiwisolver==1.4.5
@@ -278,6 +284,16 @@ swmm-toolkit==0.15.3
     # via pyswmm
 tifffile==2024.1.30
     # via scikit-image
+tomli==2.0.1
+    # via
+    #   build
+    #   coverage
+    #   mypy
+    #   pip-tools
+    #   pyproject-hooks
+    #   pytest
+toolz==0.12.1
+    # via cytoolz
 tqdm==4.66.2
     # via
     #   cdsapi
diff --git a/pyproject.toml b/pyproject.toml
index 57d1fb34..2f4f2d87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,10 +25,12 @@ classifiers = [
 dependencies = [
   # TODO definitely don't need all of these
   "cdsapi",
+  "cytoolz",
   "fastparquet",
   "fiona",
   "geopandas",
   "geopy",
+  "joblib",
   "matplotlib",
   "netcdf4",
   "netcomp@ git+https://github.com/barneydobson/NetComp.git",
diff --git a/requirements.txt b/requirements.txt
index 69e76bca..4fa086d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with Python 3.11
+# This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
 #    pip-compile
@@ -55,6 +55,8 @@ cramjam==2.7.0
     # via fastparquet
 cycler==0.12.1
     # via matplotlib
+cytoolz==0.12.3
+    # via swmmanywhere (pyproject.toml)
 dill==0.3.7
     # via multiprocess
 fastparquet==2023.10.1
@@ -81,6 +83,8 @@ idna==3.6
     # via requests
 imageio==2.33.1
     # via scikit-image
+joblib==1.3.2
+    # via swmmanywhere (pyproject.toml)
 julian==0.14
     # via pyswmm
 kiwisolver==1.4.5
@@ -220,6 +224,8 @@ swmm-toolkit==0.15.3
     # via pyswmm
 tifffile==2024.1.30
     # via scikit-image
+toolz==0.12.1
+    # via cytoolz
 tqdm==4.66.2
     # via
     #   cdsapi
diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index fbe6755e..e138ecfa 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -12,7 +12,10 @@
 import numpy as np
 import pandas as pd
 from scipy import stats
-
+import joblib
+import networkx as nx
+import cytoolz.curried as tlz
+from collections import defaultdict
 
 class MetricRegistry(dict): 
     """Registry object.""" 
@@ -209,6 +212,22 @@ def nc_compare(G1, G2, funcname, **kw):
     A1,A2 = [nx.adjacency_matrix(G) for G in (G1,G2)]
     return getattr(netcomp, funcname)(A1,A2,**kw)
 
+def edge_betweenness_centrality(G: nx.Graph, normalized: bool = True, weight: str = "weight", njobs: int = -1):
+    """Parallel betweenness centrality function"""
+    njobs = joblib.cpu_count(True) if njobs == -1 else njobs
+    node_chunks = tlz.partition_all(G.order() // njobs, G.nodes())
+    bt_func = tlz.partial(nx.edge_betweenness_centrality_subset, G=G, normalized=normalized, weight=weight)
+    bt_sc = joblib.Parallel(n_jobs=njobs)(
+        joblib.delayed(bt_func)(sources=nodes, targets=G.nodes()) for nodes in node_chunks
+    )
+
+    # Merge the betweenness centrality results
+    bt_c = defaultdict(float)
+    for bt in bt_sc:
+        for n, v in bt.items():
+            bt_c[n] += v
+    return bt_c
+
 @metrics.register
 def nc_deltacon0(synthetic_G: nx.Graph,
                   real_G: nx.Graph,
@@ -299,14 +318,27 @@ def _f(x):
 
         return (syn_tot - real_tot) / real_tot
 
+@metrics.register
+def kstest_edge_betweenness( 
+                 synthetic_G: nx.Graph,
+                 real_G: nx.Graph,
+                 **kwargs) -> float:
+        """Run the evaluated metric."""
+        syn_betweenness = edge_betweenness_centrality(synthetic_G, weight=None)
+        real_betweenness = edge_betweenness_centrality(real_G, weight=None)
+
+        #TODO does it make more sense to use statistic or pvalue?
+        return stats.ks_2samp(list(syn_betweenness.values()),
+                              list(real_betweenness.values())).statistic
+
 @metrics.register
 def kstest_betweenness( 
                  synthetic_G: nx.Graph,
                  real_G: nx.Graph,
                  **kwargs) -> float:
         """Run the evaluated metric."""
-        syn_betweenness = nx.betweenness_centrality(synthetic_G)
-        real_betweenness = nx.betweenness_centrality(real_G)
+        syn_betweenness = nx.betweenness_centrality(synthetic_G, weight=None)
+        real_betweenness = nx.betweenness_centrality(real_G, weight=None)
 
         #TODO does it make more sense to use statistic or pvalue?
         return stats.ks_2samp(list(syn_betweenness.values()),
diff --git a/swmmanywhere/misc/debug_topology.py b/swmmanywhere/misc/debug_topology.py
index 86034b42..2593779a 100644
--- a/swmmanywhere/misc/debug_topology.py
+++ b/swmmanywhere/misc/debug_topology.py
@@ -12,49 +12,52 @@
 from swmmanywhere import metric_utilities as mu
 from swmmanywhere.prepare_data import download_street
 
-# Download streets
-Gs = download_street((-0.09547,51.52196,-0.09232,51.52422))
-Gm = download_street((-0.09547,51.52196,-0.08876,51.52661))
-Gl = download_street((-0.09547,51.52196,-0.08261,51.53097))
-Gsep = download_street((-0.08261,51.53097,-0.06946,51.53718))
+if __name__ == '__main__':
+    
+    # Download streets
+    Gs = download_street((-0.09547,51.52196,-0.09232,51.52422))
+    Gm = download_street((-0.09547,51.52196,-0.08876,51.52661))
+    Gl = download_street((-0.09547,51.52196,-0.08261,51.53097))
+    Gsep = download_street((-0.08261,51.53097,-0.06946,51.53718))
 
-# Define topological metrics
-metrics = ['nc_deltacon0',
-            'nc_laplacian_dist' ,
-            'nc_laplacian_norm_dist' ,
-            'nc_adjacency_dist' ,
-            'nc_resistance_distance' ,
-            'nc_vertex_edge_distance',
-            'kstest_betweenness'
-            ]
+    # Define topological metrics
+    metrics = ['nc_deltacon0',
+                'nc_laplacian_dist' ,
+                'nc_laplacian_norm_dist' ,
+                'nc_adjacency_dist' ,
+                'nc_resistance_distance' ,
+                'nc_vertex_edge_distance',
+                'kstest_betweenness',
+                'kstest_edge_betweenness',
+                ]
 
-# Calculate metrics
-results = []
-for G1,l1  in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']):
-    for G2,l2 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']):
-        for func in metrics:
-            start = time()
-            val_ = getattr(mu.metrics, func)(synthetic_G = G1, real_G = G2)
-            end = time()
-            results.append({'func':func, 
-                            'val':val_, 
-                            'time':end - start, 
-                            'l1':l1, 
-                            'l2':l2})
-df = pd.DataFrame(results).sort_values(by=['l1','l2'])
+    # Calculate metrics
+    results = []
+    for G1,l1  in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']):
+        for G2,l2 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']):
+            for func in metrics:
+                start = time()
+                val_ = getattr(mu.metrics, func)(synthetic_G = G1, real_G = G2)
+                end = time()
+                results.append({'func':func, 
+                                'val':val_, 
+                                'time':end - start, 
+                                'l1':l1, 
+                                'l2':l2})
+    df = pd.DataFrame(results).sort_values(by=['l1','l2'])
 
-# Plot heatmap by func
-f,axs = plt.subplots(4,2,figsize=(10,10))
-for (func,grp),ax in zip(df.groupby('func'),axs.reshape(-1)):
-    grp = grp.pivot(index='l1',
-                columns= 'l2', 
-                values='val')
-    ax.imshow(grp,cmap='Reds')
-    ax.set_xticks(range(4))
-    ax.set_yticks(range(4))
-    ax.set_xticklabels(grp.columns,rotation=90)
-    ax.set_yticklabels(grp.index)
-    ax.set_title(func)
-plots = Path(__file__).parent
-f.tight_layout()
-f.savefig(plots / 'heatmap.png')
\ No newline at end of file
+    # Plot heatmap by func
+    f,axs = plt.subplots(4,2,figsize=(10,10))
+    for (func,grp),ax in zip(df.groupby('func'),axs.reshape(-1)):
+        grp = grp.pivot(index='l1',
+                    columns= 'l2', 
+                    values='val')
+        ax.imshow(grp,cmap='Reds')
+        ax.set_xticks(range(4))
+        ax.set_yticks(range(4))
+        ax.set_xticklabels(grp.columns,rotation=90)
+        ax.set_yticklabels(grp.index)
+        ax.set_title(func)
+    plots = Path(__file__).parent
+    f.tight_layout()
+    f.savefig(plots / 'heatmap.png')
\ No newline at end of file
diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py
index 978c61d0..5918f06c 100644
--- a/tests/test_metric_utilities.py
+++ b/tests/test_metric_utilities.py
@@ -92,6 +92,17 @@ def test_kstest_betweenness():
     val = mu.metrics.kstest_betweenness(synthetic_G = G_, real_G = G)
     assert_close(val, 0.2862)
 
+def test_kstest_edge_betweenness():
+    """Test the kstest_betweenness metric."""
+    G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')
+    val = mu.metrics.kstest_edge_betweenness(synthetic_G = G, real_G = G)
+    assert val == 0.0
+
+    G_ = G.copy()
+    G_.remove_node(list(G.nodes)[0])
+    val = mu.metrics.kstest_edge_betweenness(synthetic_G = G_, real_G = G)
+    assert_close(val, 0.38995)
+
 def test_best_outlet_match():
     """Test the best_outlet_match and ks_betweenness."""
     G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json')

From eb5a6019c51897bed0e765c52c5da92f923de0d9 Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Mon, 11 Mar 2024 14:19:19 +0000
Subject: [PATCH 19/21] Update metric_utilities.py

formatting
---
 swmmanywhere/metric_utilities.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index e138ecfa..84f651fc 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -3,19 +3,19 @@
 
 @author: Barnaby Dobson
 """
+from collections import defaultdict
 from inspect import signature
 from typing import Callable
 
+import cytoolz.curried as tlz
 import geopandas as gpd
+import joblib
 import netcomp
 import networkx as nx
 import numpy as np
 import pandas as pd
 from scipy import stats
-import joblib
-import networkx as nx
-import cytoolz.curried as tlz
-from collections import defaultdict
+
 
 class MetricRegistry(dict): 
     """Registry object.""" 
@@ -212,13 +212,20 @@ def nc_compare(G1, G2, funcname, **kw):
     A1,A2 = [nx.adjacency_matrix(G) for G in (G1,G2)]
     return getattr(netcomp, funcname)(A1,A2,**kw)
 
-def edge_betweenness_centrality(G: nx.Graph, normalized: bool = True, weight: str = "weight", njobs: int = -1):
-    """Parallel betweenness centrality function"""
+def edge_betweenness_centrality(G: nx.Graph, 
+                                normalized: bool = True,
+                                weight: str = "weight", 
+                                njobs: int = -1):
+    """Parallel betweenness centrality function."""
     njobs = joblib.cpu_count(True) if njobs == -1 else njobs
     node_chunks = tlz.partition_all(G.order() // njobs, G.nodes())
-    bt_func = tlz.partial(nx.edge_betweenness_centrality_subset, G=G, normalized=normalized, weight=weight)
+    bt_func = tlz.partial(nx.edge_betweenness_centrality_subset, 
+                          G=G, 
+                          normalized=normalized, 
+                          weight=weight)
     bt_sc = joblib.Parallel(n_jobs=njobs)(
-        joblib.delayed(bt_func)(sources=nodes, targets=G.nodes()) for nodes in node_chunks
+        joblib.delayed(bt_func)(sources=nodes, 
+                                targets=G.nodes()) for nodes in node_chunks
     )
 
     # Merge the betweenness centrality results

From bdcbd8b376fb6f1f0013a9ea394e6f8b36ca7483 Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Mon, 11 Mar 2024 14:21:15 +0000
Subject: [PATCH 20/21] Update pyproject.toml

formatting
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index abd90bcf..a3c4e1a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,8 +30,8 @@ dependencies = [
   "fiona",
   "geopandas",
   "geopy",
-  "joblib",
   "GitPython",
+  "joblib",
   "loguru",
   "matplotlib",
   "netcdf4",

From 4cf2675b79b1975b2ac7a55b55ae551bba8d39d9 Mon Sep 17 00:00:00 2001
From: barneydobson <barnaby.dobson1@gmail.com>
Date: Mon, 11 Mar 2024 14:27:59 +0000
Subject: [PATCH 21/21] Update metric_utilities.py

more formatting...
---
 swmmanywhere/metric_utilities.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py
index 84f651fc..77b76eae 100644
--- a/swmmanywhere/metric_utilities.py
+++ b/swmmanywhere/metric_utilities.py
@@ -5,7 +5,7 @@
 """
 from collections import defaultdict
 from inspect import signature
-from typing import Callable
+from typing import Callable, Optional
 
 import cytoolz.curried as tlz
 import geopandas as gpd
@@ -214,7 +214,7 @@ def nc_compare(G1, G2, funcname, **kw):
 
 def edge_betweenness_centrality(G: nx.Graph, 
                                 normalized: bool = True,
-                                weight: str = "weight", 
+                                weight: Optional[str] = "weight", 
                                 njobs: int = -1):
     """Parallel betweenness centrality function."""
     njobs = joblib.cpu_count(True) if njobs == -1 else njobs
@@ -229,7 +229,7 @@ def edge_betweenness_centrality(G: nx.Graph,
     )
 
     # Merge the betweenness centrality results
-    bt_c = defaultdict(float)
+    bt_c: dict[int, float] = defaultdict(float)
     for bt in bt_sc:
         for n, v in bt.items():
             bt_c[n] += v