diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index a01c4842..942d6ba0 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -96,12 +96,88 @@ class kstest_betweenness(BaseMetric): def __call__(self, synthetic_G: nx.Graph, real_G: nx.Graph, + real_subs: gpd.GeoDataFrame, + real_results: pd.DataFrame, *args, **kwargs) -> float: """Run the evaluated metric.""" - syn_betweenness = nx.betweenness_centrality(synthetic_G) - real_betweenness = nx.betweenness_centrality(real_G) + # Identify synthetic outlet and subgraph + sg_syn, _ = best_outlet_match(synthetic_G, real_subs) + + # Identify real outlet and subgraph + sg_real, _ = dominant_outlet(real_G, real_results) + + syn_betweenness = nx.betweenness_centrality(sg_syn) + real_betweenness = nx.betweenness_centrality(sg_real) #TODO does it make more sense to use statistic or pvalue? return stats.ks_2samp(list(syn_betweenness.values()), - list(real_betweenness.values())).statistic \ No newline at end of file + list(real_betweenness.values())).statistic + +def best_outlet_match(synthetic_G: nx.Graph, + real_subs: gpd.GeoDataFrame) -> tuple[nx.Graph,int]: + """Best outlet match. + + Identify the outlet with the most nodes within the real_subs and return the + subgraph of the synthetic graph of nodes that drain to that outlet. + + Args: + synthetic_G (nx.Graph): The synthetic graph. + real_subs (gpd.GeoDataFrame): The real subcatchments. + + Returns: + nx.Graph: The subgraph of the synthetic graph for the outlet with the + most nodes within the real_subs. + int: The id of the outlet. + """ + # Identify which nodes fall within real_subs + nodes_df = pd.DataFrame([d for x,d in synthetic_G.nodes(data=True)], + index = synthetic_G.nodes) + nodes_gdf = gpd.GeoDataFrame(nodes_df, + geometry=gpd.points_from_xy(nodes_df.x, + nodes_df.y), + crs = synthetic_G.graph['crs']) + nodes_joined = nodes_gdf.sjoin(real_subs, + how="right", + predicate="intersects") + + # Select the most common outlet + outlet = nodes_joined.outlet.value_counts().idxmax() + + # Subselect the matching graph + outlet_nodes = [n for n, d in synthetic_G.nodes(data=True) + if d['outlet'] == outlet] + return synthetic_G.subgraph(outlet_nodes), outlet + +def dominant_outlet(G: nx.Graph, + results: pd.DataFrame) -> tuple[nx.Graph,int]: + """Dominant outlet. + + Identify the outlet with highest flow along it and return the + subgraph of the graph of nodes that drain to that outlet. + + Args: + G (nx.Graph): The graph. + results (pd.DataFrame): The results, which include a 'flow' and 'id' + column. + + Returns: + nx.Graph: The subgraph of nodes/arcs that the reach max flow outlet + int: The id of the outlet. + """ + # Identify outlets of the graph + outlets = [n for n, outdegree in G.out_degree() + if outdegree == 0] + outlet_arcs = [d['id'] for u,v,d in G.edges(data=True) + if v in outlets] + + # Identify the outlet with the highest flow + outlet_flows = results.loc[(results.variable == 'flow') & + (results.object.isin(outlet_arcs))] + max_outlet_arc = outlet_flows.groupby('object').value.mean().idxmax() + max_outlet = [v for u,v,d in G.edges(data=True) + if d['id'] == max_outlet_arc][0] + + # Subselect the matching graph + sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet}) + return sg, max_outlet \ No newline at end of file diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index 5cb599ab..b4368363 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -1,9 +1,12 @@ from pathlib import Path +import geopandas as gpd +import networkx as nx import pandas as pd +import shapely +from swmmanywhere import metric_utilities as mu from swmmanywhere.graph_utilities import load_graph -from swmmanywhere.metric_utilities import metrics as sm def test_bias_flood_depth(): @@ -31,19 +34,77 @@ def test_bias_flood_depth(): }) # Run the metric - val = sm.bias_flood_depth(synthetic_results = synthetic_results, + val = mu.metrics.bias_flood_depth(synthetic_results = synthetic_results, real_results = real_results, synthetic_subs = synthetic_subs, real_subs = real_subs) assert val == -0.29523809523809524 -def test_kstest_betweenness(): - """Test the kstest_betweenness metric.""" +def test_best_outlet_match(): + """Test the best_outlet_match and ks_betweenness.""" G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') - val = sm.kstest_betweenness(synthetic_G = G, real_G = G) + subs = [shapely.Polygon([(700262, 5709928), + (700262, 5709883), + (700351, 5709883), + (700351, 5709906), + (700306, 5709906), + (700306, 5709928), + (700262, 5709928)]), + shapely.Polygon([(700306, 5709928), + (700284, 5709928), + (700284, 5709950), + (700374, 5709950), + (700374, 5709906), + (700351, 5709906), + (700306, 5709906), + (700306, 5709928)]), + shapely.Polygon([(700351, 5709883), + (700351, 5709906), + (700374, 5709906), + (700374, 5709883), + (700396, 5709883), + (700396, 5709816), + (700329, 5709816), + (700329, 5709838), + (700329, 5709883), + (700351, 5709883)])] + + subs = gpd.GeoDataFrame(data = {'id' : [107733, + 1696030874, + 6277683849] + }, + geometry = subs, + crs = G.graph['crs']) + + sg, outlet = mu.best_outlet_match(synthetic_G = G, + real_subs = subs) + outlets = nx.get_node_attributes(sg, 'outlet') + assert len(set(outlets.values())) == 1 + assert outlet == 12354833 + + results = pd.DataFrame([{'object' : 4253560, + 'variable' : 'flow', + 'value' : 10}, + {'object' : '', + 'variable' : 'flow', + 'value' : 5}]) + + val = mu.metrics.kstest_betweenness(synthetic_G = G, + real_G = G, + real_subs = subs, + real_results = results) + assert val == 0.0 + # Move the outlet up one node G_ = G.copy() - G_.remove_node(list(G.nodes)[0]) - val = sm.kstest_betweenness(synthetic_G = G_, real_G = G) - assert val == 0.286231884057971 \ No newline at end of file + nx.set_node_attributes(G_, + list(G_.in_edges(outlet))[0][0], + 'outlet') + G_.remove_node(outlet) + + val = mu.metrics.kstest_betweenness(synthetic_G = G_, + real_G = G, + real_subs = subs, + real_results = results) + assert val == 0.4