From 44f304f731759fb37a6391c8aa448f1917eb49aa Mon Sep 17 00:00:00 2001 From: Dobson Date: Thu, 29 Feb 2024 11:41:12 +0000 Subject: [PATCH 01/21] Implement outlet matching -Update and test KStest --- swmmanywhere/metric_utilities.py | 82 ++++++++++++++++++++++++++++++-- tests/test_metric_utilities.py | 77 ++++++++++++++++++++++++++---- 2 files changed, 148 insertions(+), 11 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index a01c4842..942d6ba0 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -96,12 +96,88 @@ class kstest_betweenness(BaseMetric): def __call__(self, synthetic_G: nx.Graph, real_G: nx.Graph, + real_subs: gpd.GeoDataFrame, + real_results: pd.DataFrame, *args, **kwargs) -> float: """Run the evaluated metric.""" - syn_betweenness = nx.betweenness_centrality(synthetic_G) - real_betweenness = nx.betweenness_centrality(real_G) + # Identify synthetic outlet and subgraph + sg_syn, _ = best_outlet_match(synthetic_G, real_subs) + + # Identify real outlet and subgraph + sg_real, _ = dominant_outlet(real_G, real_results) + + syn_betweenness = nx.betweenness_centrality(sg_syn) + real_betweenness = nx.betweenness_centrality(sg_real) #TODO does it make more sense to use statistic or pvalue? return stats.ks_2samp(list(syn_betweenness.values()), - list(real_betweenness.values())).statistic \ No newline at end of file + list(real_betweenness.values())).statistic + +def best_outlet_match(synthetic_G: nx.Graph, + real_subs: gpd.GeoDataFrame) -> tuple[nx.Graph,int]: + """Best outlet match. + + Identify the outlet with the most nodes within the real_subs and return the + subgraph of the synthetic graph of nodes that drain to that outlet. + + Args: + synthetic_G (nx.Graph): The synthetic graph. + real_subs (gpd.GeoDataFrame): The real subcatchments. + + Returns: + nx.Graph: The subgraph of the synthetic graph for the outlet with the + most nodes within the real_subs. + int: The id of the outlet. + """ + # Identify which nodes fall within real_subs + nodes_df = pd.DataFrame([d for x,d in synthetic_G.nodes(data=True)], + index = synthetic_G.nodes) + nodes_gdf = gpd.GeoDataFrame(nodes_df, + geometry=gpd.points_from_xy(nodes_df.x, + nodes_df.y), + crs = synthetic_G.graph['crs']) + nodes_joined = nodes_gdf.sjoin(real_subs, + how="right", + predicate="intersects") + + # Select the most common outlet + outlet = nodes_joined.outlet.value_counts().idxmax() + + # Subselect the matching graph + outlet_nodes = [n for n, d in synthetic_G.nodes(data=True) + if d['outlet'] == outlet] + return synthetic_G.subgraph(outlet_nodes), outlet + +def dominant_outlet(G: nx.Graph, + results: pd.DataFrame) -> tuple[nx.Graph,int]: + """Dominant outlet. + + Identify the outlet with highest flow along it and return the + subgraph of the graph of nodes that drain to that outlet. + + Args: + G (nx.Graph): The graph. + results (pd.DataFrame): The results, which include a 'flow' and 'id' + column. + + Returns: + nx.Graph: The subgraph of nodes/arcs that the reach max flow outlet + int: The id of the outlet. + """ + # Identify outlets of the graph + outlets = [n for n, outdegree in G.out_degree() + if outdegree == 0] + outlet_arcs = [d['id'] for u,v,d in G.edges(data=True) + if v in outlets] + + # Identify the outlet with the highest flow + outlet_flows = results.loc[(results.variable == 'flow') & + (results.object.isin(outlet_arcs))] + max_outlet_arc = outlet_flows.groupby('object').value.mean().idxmax() + max_outlet = [v for u,v,d in G.edges(data=True) + if d['id'] == max_outlet_arc][0] + + # Subselect the matching graph + sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet}) + return sg, max_outlet \ No newline at end of file diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index 5cb599ab..b4368363 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -1,9 +1,12 @@ from pathlib import Path +import geopandas as gpd +import networkx as nx import pandas as pd +import shapely +from swmmanywhere import metric_utilities as mu from swmmanywhere.graph_utilities import load_graph -from swmmanywhere.metric_utilities import metrics as sm def test_bias_flood_depth(): @@ -31,19 +34,77 @@ def test_bias_flood_depth(): }) # Run the metric - val = sm.bias_flood_depth(synthetic_results = synthetic_results, + val = mu.metrics.bias_flood_depth(synthetic_results = synthetic_results, real_results = real_results, synthetic_subs = synthetic_subs, real_subs = real_subs) assert val == -0.29523809523809524 -def test_kstest_betweenness(): - """Test the kstest_betweenness metric.""" +def test_best_outlet_match(): + """Test the best_outlet_match and ks_betweenness.""" G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') - val = sm.kstest_betweenness(synthetic_G = G, real_G = G) + subs = [shapely.Polygon([(700262, 5709928), + (700262, 5709883), + (700351, 5709883), + (700351, 5709906), + (700306, 5709906), + (700306, 5709928), + (700262, 5709928)]), + shapely.Polygon([(700306, 5709928), + (700284, 5709928), + (700284, 5709950), + (700374, 5709950), + (700374, 5709906), + (700351, 5709906), + (700306, 5709906), + (700306, 5709928)]), + shapely.Polygon([(700351, 5709883), + (700351, 5709906), + (700374, 5709906), + (700374, 5709883), + (700396, 5709883), + (700396, 5709816), + (700329, 5709816), + (700329, 5709838), + (700329, 5709883), + (700351, 5709883)])] + + subs = gpd.GeoDataFrame(data = {'id' : [107733, + 1696030874, + 6277683849] + }, + geometry = subs, + crs = G.graph['crs']) + + sg, outlet = mu.best_outlet_match(synthetic_G = G, + real_subs = subs) + outlets = nx.get_node_attributes(sg, 'outlet') + assert len(set(outlets.values())) == 1 + assert outlet == 12354833 + + results = pd.DataFrame([{'object' : 4253560, + 'variable' : 'flow', + 'value' : 10}, + {'object' : '', + 'variable' : 'flow', + 'value' : 5}]) + + val = mu.metrics.kstest_betweenness(synthetic_G = G, + real_G = G, + real_subs = subs, + real_results = results) + assert val == 0.0 + # Move the outlet up one node G_ = G.copy() - G_.remove_node(list(G.nodes)[0]) - val = sm.kstest_betweenness(synthetic_G = G_, real_G = G) - assert val == 0.286231884057971 \ No newline at end of file + nx.set_node_attributes(G_, + list(G_.in_edges(outlet))[0][0], + 'outlet') + G_.remove_node(outlet) + + val = mu.metrics.kstest_betweenness(synthetic_G = G_, + real_G = G, + real_subs = subs, + real_results = results) + assert val == 0.4 From 1c829e9592d2165713720a14be009cf2f0e8ef74 Mon Sep 17 00:00:00 2001 From: Dobson Date: Thu, 29 Feb 2024 16:21:33 +0000 Subject: [PATCH 02/21] Implement outlet matching -Update and test KStest -Add and test flow/flooding for a 'dominant' outlet --- swmmanywhere/metric_utilities.py | 105 ++++++++++++++ tests/test_metric_utilities.py | 235 ++++++++++++++++++++++++++----- 2 files changed, 308 insertions(+), 32 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 942d6ba0..8c2da18d 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -113,6 +113,111 @@ def __call__(self, #TODO does it make more sense to use statistic or pvalue? return stats.ks_2samp(list(syn_betweenness.values()), list(real_betweenness.values())).statistic + +@register_metric +class outlet_nse_flow(BaseMetric): + """Outlet NSE flow. + + Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow + is measured as the total flow of all arcs that drain to the 'dominant' + outlet node. The dominant outlet node of the 'real' network is calculated by + dominant_outlet, while the dominant outlet node of the 'synthetic' network + is calculated by best_outlet_match. + """ + def __call__(self, + synthetic_G: nx.Graph, + synthetic_results: pd.DataFrame, + real_G: nx.Graph, + real_results: pd.DataFrame, + real_subs: gpd.GeoDataFrame, + *args, + **kwargs) -> float: + """Run the evaluated metric.""" + # Identify synthetic and real arcs that flow into the best outlet node + _, syn_outlet = best_outlet_match(synthetic_G, real_subs) + syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True) + if v == syn_outlet] + _, real_outlet = dominant_outlet(real_G, real_results) + real_arc = [d['id'] for u,v,d in real_G.edges(data=True) + if v == real_outlet] + + # Extract flow data + syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') & + (synthetic_results.object.isin(syn_arc))] + syn_flow = syn_flow.groupby('date').value.sum().reset_index() + + real_flow = real_results.loc[(real_results.variable == 'flow') & + (real_results.object.isin(real_arc))] + real_flow = real_flow.groupby('date').value.sum().reset_index() + + # Align data + df = pd.merge(syn_flow, + real_flow, + on = 'date', + suffixes = ('_syn','_real'), + how = 'outer') + df = df.sort_values(by='date') + + # Interpolate to time in real data + df['value_syn'] = df.set_index('date').value_syn.interpolate().values + df = df.dropna(subset = 'value_real') + + # Calculate NSE + return nse(df.value_real, df.value_syn) + +@register_metric +class outlet_nse_flooding(BaseMetric): + """Outlet NSE flooding. + + Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where + flooding is the total volume of flooded water across all nodes that drain + to the 'dominant' outlet node. The dominant outlet node of the 'real' + network is calculated by dominant_outlet, while the dominant outlet node of + the 'synthetic' network is calculated by best_outlet_match. + """ + def __call__(self, + synthetic_G: nx.Graph, + synthetic_results: pd.DataFrame, + real_G: nx.Graph, + real_results: pd.DataFrame, + real_subs: gpd.GeoDataFrame, + *args, + **kwargs) -> float: + """Run the evaluated metric.""" + # Identify synthetic and real outlet arcs + sg_syn, _ = best_outlet_match(synthetic_G, real_subs) + sg_real, _ = dominant_outlet(real_G, real_results) + + # Extract flooding data + syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') & + synthetic_results.object.isin(sg_syn.nodes)] + + real_flood = real_results.loc[(real_results.variable == 'flooding') & + real_results.object.isin(sg_real.nodes)] + + # Aggregate to total flooding over network + syn_flood = syn_flood.groupby('date').value.sum().reset_index() + real_flood = real_flood.groupby('date').value.sum().reset_index() + + # Align data + df = pd.merge(syn_flood, + real_flood, + on = 'date', + suffixes = ('_syn','_real'), + how = 'outer') + df = df.sort_values(by='date') + + # Interpolate to time in real data + df['value_syn'] = df.set_index('date').value_syn.interpolate().values + df = df.dropna(subset='value_real') + + # Calculate NSE + return nse(df.value_real, df.value_syn) + +def nse(y: np.ndarray, + yhat: np.ndarray) -> float: + """Calculate Nash-Sutcliffe efficiency (NSE).""" + return 1 - np.sum((y - yhat)**2) / np.sum((y - np.mean(y))**2) def best_outlet_match(synthetic_G: nx.Graph, real_subs: gpd.GeoDataFrame) -> tuple[nx.Graph,int]: diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index b4368363..4efd921e 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -2,6 +2,7 @@ import geopandas as gpd import networkx as nx +import numpy as np import pandas as pd import shapely @@ -9,6 +10,42 @@ from swmmanywhere.graph_utilities import load_graph +def get_subs(): + """Get a GeoDataFrame of subcatchments.""" + subs = [shapely.Polygon([(700262, 5709928), + (700262, 5709883), + (700351, 5709883), + (700351, 5709906), + (700306, 5709906), + (700306, 5709928), + (700262, 5709928)]), + shapely.Polygon([(700306, 5709928), + (700284, 5709928), + (700284, 5709950), + (700374, 5709950), + (700374, 5709906), + (700351, 5709906), + (700306, 5709906), + (700306, 5709928)]), + shapely.Polygon([(700351, 5709883), + (700351, 5709906), + (700374, 5709906), + (700374, 5709883), + (700396, 5709883), + (700396, 5709816), + (700329, 5709816), + (700329, 5709838), + (700329, 5709883), + (700351, 5709883)])] + + subs = gpd.GeoDataFrame(data = {'id' : [107733, + 1696030874, + 6277683849] + }, + geometry = subs, + crs = 'EPSG:32630') + return subs + def test_bias_flood_depth(): """Test the bias_flood_depth metric.""" # Create synthetic and real data @@ -43,38 +80,7 @@ def test_bias_flood_depth(): def test_best_outlet_match(): """Test the best_outlet_match and ks_betweenness.""" G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') - subs = [shapely.Polygon([(700262, 5709928), - (700262, 5709883), - (700351, 5709883), - (700351, 5709906), - (700306, 5709906), - (700306, 5709928), - (700262, 5709928)]), - shapely.Polygon([(700306, 5709928), - (700284, 5709928), - (700284, 5709950), - (700374, 5709950), - (700374, 5709906), - (700351, 5709906), - (700306, 5709906), - (700306, 5709928)]), - shapely.Polygon([(700351, 5709883), - (700351, 5709906), - (700374, 5709906), - (700374, 5709883), - (700396, 5709883), - (700396, 5709816), - (700329, 5709816), - (700329, 5709838), - (700329, 5709883), - (700351, 5709883)])] - - subs = gpd.GeoDataFrame(data = {'id' : [107733, - 1696030874, - 6277683849] - }, - geometry = subs, - crs = G.graph['crs']) + subs = get_subs() sg, outlet = mu.best_outlet_match(synthetic_G = G, real_subs = subs) @@ -108,3 +114,168 @@ def test_best_outlet_match(): real_subs = subs, real_results = results) assert val == 0.4 + +def test_nse(): + """Test the nse metric.""" + val = mu.nse(y = np.array([1,2,3,4,5]), + yhat = np.array([1,2,3,4,5])) + assert val == 1.0 + + val = mu.nse(y = np.array([1,2,3,4,5]), + yhat = np.array([3,3,3,3,3])) + assert val == 0.0 + +def test_outlet_nse_flow(): + """Test the outlet_nse_flow metric.""" + # Load data + G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') + subs = get_subs() + + # Mock results + results = pd.DataFrame([{'object' : 4253560, + 'variable' : 'flow', + 'value' : 10, + 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + {'object' : '', + 'variable' : 'flow', + 'value' : 5, + 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + {'object' : 4253560, + 'variable' : 'flow', + 'value' : 5, + 'date' : pd.to_datetime('2021-01-01 00:00:05')}, + {'object' : '', + 'variable' : 'flow', + 'value' : 2, + 'date' : pd.to_datetime('2021-01-01 00:00:05')}]) + + # Calculate NSE (perfect results) + val = mu.metrics.outlet_nse_flow(synthetic_G = G, + synthetic_results = results, + real_G = G, + real_results = results, + real_subs = subs) + assert val == 1.0 + + # Calculate NSE (mean results) + results_ = results.copy() + results_.loc[[0,2],'value'] = 7.5 + val = mu.metrics.outlet_nse_flow(synthetic_G = G, + synthetic_results = results_, + real_G = G, + real_results = results, + real_subs = subs) + assert val == 0.0 + + # Change the graph + G_ = G.copy() + new_outlet = list(G_.in_edges(12354833))[0][0] + nx.set_node_attributes(G_, + new_outlet, + 'outlet') + G_.remove_node(12354833) + results_.loc[results_.object == 4253560, 'object'] = 725226531 + + # Calculate NSE (mean results) + val = mu.metrics.outlet_nse_flow(synthetic_G = G_, + synthetic_results = results_, + real_G = G, + real_results = results, + real_subs = subs) + assert val == 0.0 + + # Test time interpolation + results_.loc[2,'date'] = pd.to_datetime('2021-01-01 00:00:10') + results_.loc[[0,2], 'value'] = [0,30] + val = mu.metrics.outlet_nse_flow(synthetic_G = G_, + synthetic_results = results_, + real_G = G, + real_results = results, + real_subs = subs) + assert val == -15.0 + +def test_outlet_nse_flooding(): + """Test the outlet_nse_flow metric.""" + # Load data + G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') + subs = get_subs() + + # Mock results + results = pd.DataFrame([{'object' : 4253560, + 'variable' : 'flow', + 'value' : 10, + 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + {'object' : 4253560, + 'variable' : 'flow', + 'value' : 5, + 'date' : pd.to_datetime('2021-01-01 00:00:05')}, + {'object' : 25472468, + 'variable' : 'flooding', + 'value' : 4.5, + 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + {'object' : 770549936, + 'variable' : 'flooding', + 'value' : 5, + 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + {'object' : 109753, + 'variable' : 'flooding', + 'value' : 10, + 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + {'object' : 25472468, + 'variable' : 'flooding', + 'value' : 0, + 'date' : pd.to_datetime('2021-01-01 00:00:05')}, + {'object' : 770549936, + 'variable' : 'flooding', + 'value' : 5, + 'date' : pd.to_datetime('2021-01-01 00:00:05')}, + {'object' : 109753, + 'variable' : 'flooding', + 'value' : 15, + 'date' : pd.to_datetime('2021-01-01 00:00:05')}]) + + # Calculate NSE (perfect results) + val = mu.metrics.outlet_nse_flooding(synthetic_G = G, + synthetic_results = results, + real_G = G, + real_results = results, + real_subs = subs) + assert val == 1.0 + + # Calculate NSE (mean results) + results_ = results.copy() + results_.loc[results_.object.isin([770549936, 25472468]),'value'] = [14.5 / 4] * 4 + val = mu.metrics.outlet_nse_flooding(synthetic_G = G, + synthetic_results = results_, + real_G = G, + real_results = results, + real_subs = subs) + assert val == 0.0 + + # Change the graph + G_ = G.copy() + new_outlet = list(G_.in_edges(12354833))[0][0] + nx.set_node_attributes(G_, + {x : new_outlet for x,d in G_.nodes(data = True) + if d['outlet'] == 12354833}, + 'outlet') + G_.remove_node(12354833) + + # Calculate NSE (mean results) + val = mu.metrics.outlet_nse_flooding(synthetic_G = G_, + synthetic_results = results_, + real_G = G, + real_results = results, + real_subs = subs) + assert val == 0.0 + + # Test time interpolation + results_.loc[results_.date == pd.to_datetime('2021-01-01 00:00:05'), + 'date'] = pd.to_datetime('2021-01-01 00:00:10') + + val = mu.metrics.outlet_nse_flooding(synthetic_G = G_, + synthetic_results = results_, + real_G = G, + real_results = results, + real_subs = subs) + assert val == 0.0 \ No newline at end of file From d8bcd0da9aafe25d7895dde3d985191507db741e Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 11:45:50 +0000 Subject: [PATCH 03/21] Update metric_utilities.py update to merge with `metric_format` --- swmmanywhere/metric_utilities.py | 244 ++++++++++++++----------------- 1 file changed, 113 insertions(+), 131 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 8c2da18d..d57f0823 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -3,7 +3,7 @@ @author: Barnaby Dobson """ -from abc import ABC, abstractmethod +from inspect import signature from typing import Callable import geopandas as gpd @@ -13,25 +13,33 @@ from scipy import stats -class BaseMetric(ABC): - """Base metric class.""" - @abstractmethod - def __call__(self, - *args, - **kwargs) -> float: - """Run the evaluated metric.""" - return 0 - class MetricRegistry(dict): """Registry object.""" - def register(self, cls): + def register(self, func: Callable) -> Callable: """Register a metric.""" - if cls.__name__ in self: - raise ValueError(f"{cls.__name__} already in the metric registry!") - - self[cls.__name__] = cls() - return cls + if func.__name__ in self: + raise ValueError(f"{func.__name__} already in the metric registry!") + + allowable_params = {"synthetic_results": pd.DataFrame, + "real_results": pd.DataFrame, + "synthetic_subs": gpd.GeoDataFrame, + "real_subs": gpd.GeoDataFrame, + "synthetic_G": nx.Graph, + "real_G": nx.Graph} + + sig = signature(func) + for param, obj in sig.parameters.items(): + if param == 'kwargs': + continue + if param not in allowable_params: + raise ValueError(f"{param} of {func.__name__} not allowed.") + if obj.annotation != allowable_params[param]: + raise ValueError(f"""{param} of {func.__name__} should be of + type {allowable_params[param]}, not + {obj.__class__}.""") + self[func.__name__] = func + return func def __getattr__(self, name): """Get a metric from the graphfcn dict.""" @@ -43,18 +51,6 @@ def __getattr__(self, name): metrics = MetricRegistry() -def register_metric(cls) -> Callable: - """Register a metric. - - Args: - cls (Callable): A class that inherits from BaseMetric - - Returns: - cls (Callable): The same class - """ - metrics.register(cls) - return cls - def extract_var(df: pd.DataFrame, var: str) -> pd.DataFrame: """Extract var from a dataframe.""" @@ -63,15 +59,12 @@ def extract_var(df: pd.DataFrame, df_.date.min()).dt.total_seconds() return df_ -@register_metric -class bias_flood_depth(BaseMetric): - """Bias flood depth.""" - def __call__(self, +@metrics.register +def bias_flood_depth( synthetic_results: pd.DataFrame, real_results: pd.DataFrame, synthetic_subs: gpd.GeoDataFrame, real_subs: gpd.GeoDataFrame, - *args, **kwargs) -> float: """Run the evaluated metric.""" @@ -90,32 +83,34 @@ def _f(x): return (syn_tot - real_tot) / real_tot -@register_metric -class kstest_betweenness(BaseMetric): +@metrics.register +def kstest_betweenness( + synthetic_G: nx.Graph, + real_G: nx.Graph, + real_subs: gpd.GeoDataFrame, + real_results: pd.DataFrame, + **kwargs) -> float: """KS two sided of betweenness distribution.""" - def __call__(self, - synthetic_G: nx.Graph, - real_G: nx.Graph, - real_subs: gpd.GeoDataFrame, - real_results: pd.DataFrame, - *args, - **kwargs) -> float: - """Run the evaluated metric.""" - # Identify synthetic outlet and subgraph - sg_syn, _ = best_outlet_match(synthetic_G, real_subs) - - # Identify real outlet and subgraph - sg_real, _ = dominant_outlet(real_G, real_results) + # Identify synthetic outlet and subgraph + sg_syn, _ = best_outlet_match(synthetic_G, real_subs) + + # Identify real outlet and subgraph + sg_real, _ = dominant_outlet(real_G, real_results) - syn_betweenness = nx.betweenness_centrality(sg_syn) - real_betweenness = nx.betweenness_centrality(sg_real) + syn_betweenness = nx.betweenness_centrality(sg_syn) + real_betweenness = nx.betweenness_centrality(sg_real) - #TODO does it make more sense to use statistic or pvalue? - return stats.ks_2samp(list(syn_betweenness.values()), - list(real_betweenness.values())).statistic + #TODO does it make more sense to use statistic or pvalue? + return stats.ks_2samp(list(syn_betweenness.values()), + list(real_betweenness.values())).statistic -@register_metric -class outlet_nse_flow(BaseMetric): +@metrics.register +def outlet_nse_flow(synthetic_G: nx.Graph, + synthetic_results: pd.DataFrame, + real_G: nx.Graph, + real_results: pd.DataFrame, + real_subs: gpd.GeoDataFrame, + **kwargs) -> float: """Outlet NSE flow. Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow @@ -124,49 +119,45 @@ class outlet_nse_flow(BaseMetric): dominant_outlet, while the dominant outlet node of the 'synthetic' network is calculated by best_outlet_match. """ - def __call__(self, - synthetic_G: nx.Graph, + # Identify synthetic and real arcs that flow into the best outlet node + _, syn_outlet = best_outlet_match(synthetic_G, real_subs) + syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True) + if v == syn_outlet] + _, real_outlet = dominant_outlet(real_G, real_results) + real_arc = [d['id'] for u,v,d in real_G.edges(data=True) + if v == real_outlet] + + # Extract flow data + syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') & + (synthetic_results.object.isin(syn_arc))] + syn_flow = syn_flow.groupby('date').value.sum().reset_index() + + real_flow = real_results.loc[(real_results.variable == 'flow') & + (real_results.object.isin(real_arc))] + real_flow = real_flow.groupby('date').value.sum().reset_index() + + # Align data + df = pd.merge(syn_flow, + real_flow, + on = 'date', + suffixes = ('_syn','_real'), + how = 'outer') + df = df.sort_values(by='date') + + # Interpolate to time in real data + df['value_syn'] = df.set_index('date').value_syn.interpolate().values + df = df.dropna(subset = 'value_real') + + # Calculate NSE + return nse(df.value_real, df.value_syn) + +@metrics.register +def outlet_nse_flooding(synthetic_G: nx.Graph, synthetic_results: pd.DataFrame, real_G: nx.Graph, real_results: pd.DataFrame, real_subs: gpd.GeoDataFrame, - *args, **kwargs) -> float: - """Run the evaluated metric.""" - # Identify synthetic and real arcs that flow into the best outlet node - _, syn_outlet = best_outlet_match(synthetic_G, real_subs) - syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True) - if v == syn_outlet] - _, real_outlet = dominant_outlet(real_G, real_results) - real_arc = [d['id'] for u,v,d in real_G.edges(data=True) - if v == real_outlet] - - # Extract flow data - syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') & - (synthetic_results.object.isin(syn_arc))] - syn_flow = syn_flow.groupby('date').value.sum().reset_index() - - real_flow = real_results.loc[(real_results.variable == 'flow') & - (real_results.object.isin(real_arc))] - real_flow = real_flow.groupby('date').value.sum().reset_index() - - # Align data - df = pd.merge(syn_flow, - real_flow, - on = 'date', - suffixes = ('_syn','_real'), - how = 'outer') - df = df.sort_values(by='date') - - # Interpolate to time in real data - df['value_syn'] = df.set_index('date').value_syn.interpolate().values - df = df.dropna(subset = 'value_real') - - # Calculate NSE - return nse(df.value_real, df.value_syn) - -@register_metric -class outlet_nse_flooding(BaseMetric): """Outlet NSE flooding. Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where @@ -175,44 +166,35 @@ class outlet_nse_flooding(BaseMetric): network is calculated by dominant_outlet, while the dominant outlet node of the 'synthetic' network is calculated by best_outlet_match. """ - def __call__(self, - synthetic_G: nx.Graph, - synthetic_results: pd.DataFrame, - real_G: nx.Graph, - real_results: pd.DataFrame, - real_subs: gpd.GeoDataFrame, - *args, - **kwargs) -> float: - """Run the evaluated metric.""" - # Identify synthetic and real outlet arcs - sg_syn, _ = best_outlet_match(synthetic_G, real_subs) - sg_real, _ = dominant_outlet(real_G, real_results) - - # Extract flooding data - syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') & - synthetic_results.object.isin(sg_syn.nodes)] - - real_flood = real_results.loc[(real_results.variable == 'flooding') & - real_results.object.isin(sg_real.nodes)] - - # Aggregate to total flooding over network - syn_flood = syn_flood.groupby('date').value.sum().reset_index() - real_flood = real_flood.groupby('date').value.sum().reset_index() - - # Align data - df = pd.merge(syn_flood, - real_flood, - on = 'date', - suffixes = ('_syn','_real'), - how = 'outer') - df = df.sort_values(by='date') - - # Interpolate to time in real data - df['value_syn'] = df.set_index('date').value_syn.interpolate().values - df = df.dropna(subset='value_real') - - # Calculate NSE - return nse(df.value_real, df.value_syn) + # Identify synthetic and real outlet arcs + sg_syn, _ = best_outlet_match(synthetic_G, real_subs) + sg_real, _ = dominant_outlet(real_G, real_results) + + # Extract flooding data + syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') & + synthetic_results.object.isin(sg_syn.nodes)] + + real_flood = real_results.loc[(real_results.variable == 'flooding') & + real_results.object.isin(sg_real.nodes)] + + # Aggregate to total flooding over network + syn_flood = syn_flood.groupby('date').value.sum().reset_index() + real_flood = real_flood.groupby('date').value.sum().reset_index() + + # Align data + df = pd.merge(syn_flood, + real_flood, + on = 'date', + suffixes = ('_syn','_real'), + how = 'outer') + df = df.sort_values(by='date') + + # Interpolate to time in real data + df['value_syn'] = df.set_index('date').value_syn.interpolate().values + df = df.dropna(subset='value_real') + + # Calculate NSE + return nse(df.value_real, df.value_syn) def nse(y: np.ndarray, yhat: np.ndarray) -> float: From 1cbd9b8ec836229bc4a8427b215831d1d32eca38 Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 12:05:55 +0000 Subject: [PATCH 04/21] Update metric_utilities.py revert kstest_betweenness --- swmmanywhere/metric_utilities.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index d57f0823..056fa029 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -84,22 +84,14 @@ def _f(x): return (syn_tot - real_tot) / real_tot @metrics.register -def kstest_betweenness( - synthetic_G: nx.Graph, - real_G: nx.Graph, - real_subs: gpd.GeoDataFrame, - real_results: pd.DataFrame, - **kwargs) -> float: +def kstest_betweenness( + synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: """KS two sided of betweenness distribution.""" - # Identify synthetic outlet and subgraph - sg_syn, _ = best_outlet_match(synthetic_G, real_subs) + syn_betweenness = nx.betweenness_centrality(synthetic_G) + real_betweenness = nx.betweenness_centrality(real_G) - # Identify real outlet and subgraph - sg_real, _ = dominant_outlet(real_G, real_results) - - syn_betweenness = nx.betweenness_centrality(sg_syn) - real_betweenness = nx.betweenness_centrality(sg_real) - #TODO does it make more sense to use statistic or pvalue? return stats.ks_2samp(list(syn_betweenness.values()), list(real_betweenness.values())).statistic From 14b2ce4ffb1f3ab21365ff92a7b6fdce1fc05f51 Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 12:08:04 +0000 Subject: [PATCH 05/21] Update metric_utilities.py try and make the diff tidier --- swmmanywhere/metric_utilities.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 056fa029..2389cb55 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -88,13 +88,13 @@ def kstest_betweenness( synthetic_G: nx.Graph, real_G: nx.Graph, **kwargs) -> float: - """KS two sided of betweenness distribution.""" - syn_betweenness = nx.betweenness_centrality(synthetic_G) - real_betweenness = nx.betweenness_centrality(real_G) - - #TODO does it make more sense to use statistic or pvalue? - return stats.ks_2samp(list(syn_betweenness.values()), - list(real_betweenness.values())).statistic + """KS two sided of betweenness distribution.""" + syn_betweenness = nx.betweenness_centrality(synthetic_G) + real_betweenness = nx.betweenness_centrality(real_G) + + #TODO does it make more sense to use statistic or pvalue? + return stats.ks_2samp(list(syn_betweenness.values()), + list(real_betweenness.values())).statistic @metrics.register def outlet_nse_flow(synthetic_G: nx.Graph, From e16dac32e5b2c2853f5578f8e63f92547f64845a Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 13:03:44 +0000 Subject: [PATCH 06/21] Update test_metric_utilities.py Revert kstest test --- tests/test_metric_utilities.py | 40 ++++++++++------------------------ 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index 17ac11ec..8e14d695 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -77,44 +77,28 @@ def test_bias_flood_depth(): real_subs = real_subs) assert np.isclose(val, -0.29523809523809524) +def test_kstest_betweenness(): + """Test the kstest_betweenness metric.""" + G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') + val = mu.metrics.kstest_betweenness(synthetic_G = G, real_G = G) + assert val == 0.0 + + G_ = G.copy() + G_.remove_node(list(G.nodes)[0]) + val = mu.metrics.kstest_betweenness(synthetic_G = G_, real_G = G) + assert np.isclose(val, 0.286231884057971) + def test_best_outlet_match(): """Test the best_outlet_match and ks_betweenness.""" G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') subs = get_subs() - sg, outlet = mu.best_outlet_match(synthetic_G = G, + sg, outlet = mu.metrics.best_outlet_match(synthetic_G = G, real_subs = subs) outlets = nx.get_node_attributes(sg, 'outlet') assert len(set(outlets.values())) == 1 assert outlet == 12354833 - results = pd.DataFrame([{'object' : 4253560, - 'variable' : 'flow', - 'value' : 10}, - {'object' : '', - 'variable' : 'flow', - 'value' : 5}]) - - val = mu.metrics.kstest_betweenness(synthetic_G = G, - real_G = G, - real_subs = subs, - real_results = results) - - assert val == 0.0 - - # Move the outlet up one node - G_ = G.copy() - nx.set_node_attributes(G_, - list(G_.in_edges(outlet))[0][0], - 'outlet') - G_.remove_node(outlet) - - val = mu.metrics.kstest_betweenness(synthetic_G = G_, - real_G = G, - real_subs = subs, - real_results = results) - assert val == 0.4 - def test_nse(): """Test the nse metric.""" val = mu.nse(y = np.array([1,2,3,4,5]), From 6dd0e65efaa9efc4ce91e809e21653daf715d7ca Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 13:05:33 +0000 Subject: [PATCH 07/21] Update test_metric_utilities.py typo --- tests/test_metric_utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index 8e14d695..fa8be33a 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -93,7 +93,7 @@ def test_best_outlet_match(): G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') subs = get_subs() - sg, outlet = mu.metrics.best_outlet_match(synthetic_G = G, + sg, outlet = mu.best_outlet_match(synthetic_G = G, real_subs = subs) outlets = nx.get_node_attributes(sg, 'outlet') assert len(set(outlets.values())) == 1 From d1eda73783cf9e6e99f511df7639b4d004dae19c Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 13:32:35 +0000 Subject: [PATCH 08/21] Update metric_utilities.py refactor --- swmmanywhere/metric_utilities.py | 109 +++++++++++++++---------------- 1 file changed, 54 insertions(+), 55 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 2389cb55..46e3a095 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -95,7 +95,41 @@ def kstest_betweenness( #TODO does it make more sense to use statistic or pvalue? return stats.ks_2samp(list(syn_betweenness.values()), list(real_betweenness.values())).statistic - + +def align_calc_nse(synthetic_results: pd.DataFrame, + real_results: pd.DataFrame, + variable: str, + syn_ids: list, + real_ids: list) -> float: + """Align and calculate NSE. + + Align the synthetic and real data and calculate the Nash-Sutcliffe + efficiency (NSE) of the variable over time. In cases where the synthetic + data is does not overlap the real data, the value is interpolated. + """ + # Extract data + syn_data = extract_var(synthetic_results, variable) + syn_data = syn_data.loc[syn_data.object.isin(syn_ids)] + syn_data = syn_data.groupby('date').value.sum().reset_index() + + real_data = extract_var(real_results, variable) + real_data = real_data.loc[real_data.object.isin(real_ids)] + real_data = real_data.groupby('date').value.sum().reset_index() + + # Align data + df = pd.merge(syn_data, + real_data, + on='date', + suffixes=('_syn', '_real'), + how='outer').sort_values(by='date') + + # Interpolate to time in real data + df['value_syn'] = df.set_index('date').value_syn.interpolate().values + df = df.dropna(subset=['value_real']) + + # Calculate NSE + return nse(df.value_real, df.value_syn) + @metrics.register def outlet_nse_flow(synthetic_G: nx.Graph, synthetic_results: pd.DataFrame, @@ -119,29 +153,12 @@ def outlet_nse_flow(synthetic_G: nx.Graph, real_arc = [d['id'] for u,v,d in real_G.edges(data=True) if v == real_outlet] - # Extract flow data - syn_flow = synthetic_results.loc[(synthetic_results.variable == 'flow') & - (synthetic_results.object.isin(syn_arc))] - syn_flow = syn_flow.groupby('date').value.sum().reset_index() - - real_flow = real_results.loc[(real_results.variable == 'flow') & - (real_results.object.isin(real_arc))] - real_flow = real_flow.groupby('date').value.sum().reset_index() - - # Align data - df = pd.merge(syn_flow, - real_flow, - on = 'date', - suffixes = ('_syn','_real'), - how = 'outer') - df = df.sort_values(by='date') - - # Interpolate to time in real data - df['value_syn'] = df.set_index('date').value_syn.interpolate().values - df = df.dropna(subset = 'value_real') + return align_calc_nse(synthetic_results, + real_results, + 'flow', + syn_arc, + real_arc) - # Calculate NSE - return nse(df.value_real, df.value_syn) @metrics.register def outlet_nse_flooding(synthetic_G: nx.Graph, @@ -162,31 +179,11 @@ def outlet_nse_flooding(synthetic_G: nx.Graph, sg_syn, _ = best_outlet_match(synthetic_G, real_subs) sg_real, _ = dominant_outlet(real_G, real_results) - # Extract flooding data - syn_flood = synthetic_results.loc[(synthetic_results.variable == 'flooding') & - synthetic_results.object.isin(sg_syn.nodes)] - - real_flood = real_results.loc[(real_results.variable == 'flooding') & - real_results.object.isin(sg_real.nodes)] - - # Aggregate to total flooding over network - syn_flood = syn_flood.groupby('date').value.sum().reset_index() - real_flood = real_flood.groupby('date').value.sum().reset_index() - - # Align data - df = pd.merge(syn_flood, - real_flood, - on = 'date', - suffixes = ('_syn','_real'), - how = 'outer') - df = df.sort_values(by='date') - - # Interpolate to time in real data - df['value_syn'] = df.set_index('date').value_syn.interpolate().values - df = df.dropna(subset='value_real') - - # Calculate NSE - return nse(df.value_real, df.value_syn) + return align_calc_nse(synthetic_results, + real_results, + 'flooding', + list(sg_syn.nodes), + list(sg_real.nodes)) def nse(y: np.ndarray, yhat: np.ndarray) -> float: @@ -212,13 +209,15 @@ def best_outlet_match(synthetic_G: nx.Graph, # Identify which nodes fall within real_subs nodes_df = pd.DataFrame([d for x,d in synthetic_G.nodes(data=True)], index = synthetic_G.nodes) - nodes_gdf = gpd.GeoDataFrame(nodes_df, - geometry=gpd.points_from_xy(nodes_df.x, - nodes_df.y), - crs = synthetic_G.graph['crs']) - nodes_joined = nodes_gdf.sjoin(real_subs, - how="right", - predicate="intersects") + nodes_joined = ( + gpd.GeoDataFrame(nodes_df, + geometry=gpd.points_from_xy(nodes_df.x, + nodes_df.y), + crs = synthetic_G.graph['crs']) + .sjoin(real_subs, + how="right", + predicate="intersects") + ) # Select the most common outlet outlet = nodes_joined.outlet.value_counts().idxmax() From d9ba7110f97ebdcec980876548fa4ce4f7c895e3 Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 13:34:48 +0000 Subject: [PATCH 09/21] Update metric_utilities.py move utiltiies up --- swmmanywhere/metric_utilities.py | 186 +++++++++++++++---------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 46e3a095..763d074d 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -59,43 +59,6 @@ def extract_var(df: pd.DataFrame, df_.date.min()).dt.total_seconds() return df_ -@metrics.register -def bias_flood_depth( - synthetic_results: pd.DataFrame, - real_results: pd.DataFrame, - synthetic_subs: gpd.GeoDataFrame, - real_subs: gpd.GeoDataFrame, - **kwargs) -> float: - """Run the evaluated metric.""" - - def _f(x): - return np.trapz(x.value,x.duration) - - syn_flooding = extract_var(synthetic_results, - 'flooding').groupby('object').apply(_f) - syn_area = synthetic_subs.impervious_area.sum() - syn_tot = syn_flooding.sum() / syn_area - - real_flooding = extract_var(real_results, - 'flooding').groupby('object').apply(_f) - real_area = real_subs.impervious_area.sum() - real_tot = real_flooding.sum() / real_area - - return (syn_tot - real_tot) / real_tot - -@metrics.register -def kstest_betweenness( - synthetic_G: nx.Graph, - real_G: nx.Graph, - **kwargs) -> float: - """KS two sided of betweenness distribution.""" - syn_betweenness = nx.betweenness_centrality(synthetic_G) - real_betweenness = nx.betweenness_centrality(real_G) - - #TODO does it make more sense to use statistic or pvalue? - return stats.ks_2samp(list(syn_betweenness.values()), - list(real_betweenness.values())).statistic - def align_calc_nse(synthetic_results: pd.DataFrame, real_results: pd.DataFrame, variable: str, @@ -130,61 +93,6 @@ def align_calc_nse(synthetic_results: pd.DataFrame, # Calculate NSE return nse(df.value_real, df.value_syn) -@metrics.register -def outlet_nse_flow(synthetic_G: nx.Graph, - synthetic_results: pd.DataFrame, - real_G: nx.Graph, - real_results: pd.DataFrame, - real_subs: gpd.GeoDataFrame, - **kwargs) -> float: - """Outlet NSE flow. - - Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow - is measured as the total flow of all arcs that drain to the 'dominant' - outlet node. The dominant outlet node of the 'real' network is calculated by - dominant_outlet, while the dominant outlet node of the 'synthetic' network - is calculated by best_outlet_match. - """ - # Identify synthetic and real arcs that flow into the best outlet node - _, syn_outlet = best_outlet_match(synthetic_G, real_subs) - syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True) - if v == syn_outlet] - _, real_outlet = dominant_outlet(real_G, real_results) - real_arc = [d['id'] for u,v,d in real_G.edges(data=True) - if v == real_outlet] - - return align_calc_nse(synthetic_results, - real_results, - 'flow', - syn_arc, - real_arc) - - -@metrics.register -def outlet_nse_flooding(synthetic_G: nx.Graph, - synthetic_results: pd.DataFrame, - real_G: nx.Graph, - real_results: pd.DataFrame, - real_subs: gpd.GeoDataFrame, - **kwargs) -> float: - """Outlet NSE flooding. - - Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where - flooding is the total volume of flooded water across all nodes that drain - to the 'dominant' outlet node. The dominant outlet node of the 'real' - network is calculated by dominant_outlet, while the dominant outlet node of - the 'synthetic' network is calculated by best_outlet_match. - """ - # Identify synthetic and real outlet arcs - sg_syn, _ = best_outlet_match(synthetic_G, real_subs) - sg_real, _ = dominant_outlet(real_G, real_results) - - return align_calc_nse(synthetic_results, - real_results, - 'flooding', - list(sg_syn.nodes), - list(sg_real.nodes)) - def nse(y: np.ndarray, yhat: np.ndarray) -> float: """Calculate Nash-Sutcliffe efficiency (NSE).""" @@ -258,4 +166,96 @@ def dominant_outlet(G: nx.Graph, # Subselect the matching graph sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet}) - return sg, max_outlet \ No newline at end of file + return sg, max_outlet + +@metrics.register +def bias_flood_depth( + synthetic_results: pd.DataFrame, + real_results: pd.DataFrame, + synthetic_subs: gpd.GeoDataFrame, + real_subs: gpd.GeoDataFrame, + **kwargs) -> float: + """Run the evaluated metric.""" + + def _f(x): + return np.trapz(x.value,x.duration) + + syn_flooding = extract_var(synthetic_results, + 'flooding').groupby('object').apply(_f) + syn_area = synthetic_subs.impervious_area.sum() + syn_tot = syn_flooding.sum() / syn_area + + real_flooding = extract_var(real_results, + 'flooding').groupby('object').apply(_f) + real_area = real_subs.impervious_area.sum() + real_tot = real_flooding.sum() / real_area + + return (syn_tot - real_tot) / real_tot + +@metrics.register +def kstest_betweenness( + synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """KS two sided of betweenness distribution.""" + syn_betweenness = nx.betweenness_centrality(synthetic_G) + real_betweenness = nx.betweenness_centrality(real_G) + + #TODO does it make more sense to use statistic or pvalue? + return stats.ks_2samp(list(syn_betweenness.values()), + list(real_betweenness.values())).statistic + +@metrics.register +def outlet_nse_flow(synthetic_G: nx.Graph, + synthetic_results: pd.DataFrame, + real_G: nx.Graph, + real_results: pd.DataFrame, + real_subs: gpd.GeoDataFrame, + **kwargs) -> float: + """Outlet NSE flow. + + Calculate the Nash-Sutcliffe efficiency (NSE) of flow over time, where flow + is measured as the total flow of all arcs that drain to the 'dominant' + outlet node. The dominant outlet node of the 'real' network is calculated by + dominant_outlet, while the dominant outlet node of the 'synthetic' network + is calculated by best_outlet_match. + """ + # Identify synthetic and real arcs that flow into the best outlet node + _, syn_outlet = best_outlet_match(synthetic_G, real_subs) + syn_arc = [d['id'] for u,v,d in synthetic_G.edges(data=True) + if v == syn_outlet] + _, real_outlet = dominant_outlet(real_G, real_results) + real_arc = [d['id'] for u,v,d in real_G.edges(data=True) + if v == real_outlet] + + return align_calc_nse(synthetic_results, + real_results, + 'flow', + syn_arc, + real_arc) + + +@metrics.register +def outlet_nse_flooding(synthetic_G: nx.Graph, + synthetic_results: pd.DataFrame, + real_G: nx.Graph, + real_results: pd.DataFrame, + real_subs: gpd.GeoDataFrame, + **kwargs) -> float: + """Outlet NSE flooding. + + Calculate the Nash-Sutcliffe efficiency (NSE) of flooding over time, where + flooding is the total volume of flooded water across all nodes that drain + to the 'dominant' outlet node. The dominant outlet node of the 'real' + network is calculated by dominant_outlet, while the dominant outlet node of + the 'synthetic' network is calculated by best_outlet_match. + """ + # Identify synthetic and real outlet arcs + sg_syn, _ = best_outlet_match(synthetic_G, real_subs) + sg_real, _ = dominant_outlet(real_G, real_results) + + return align_calc_nse(synthetic_results, + real_results, + 'flooding', + list(sg_syn.nodes), + list(sg_real.nodes)) \ No newline at end of file From 03fb75bc0c2c3604eaa460c2f2dd2887159ac227 Mon Sep 17 00:00:00 2001 From: Dobson Date: Wed, 6 Mar 2024 13:36:24 +0000 Subject: [PATCH 10/21] Update metric_utilities.py keep ks betweenness the same --- swmmanywhere/metric_utilities.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 763d074d..2e943989 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -197,13 +197,13 @@ def kstest_betweenness( synthetic_G: nx.Graph, real_G: nx.Graph, **kwargs) -> float: - """KS two sided of betweenness distribution.""" + """Run the evaluated metric.""" syn_betweenness = nx.betweenness_centrality(synthetic_G) real_betweenness = nx.betweenness_centrality(real_G) - + #TODO does it make more sense to use statistic or pvalue? return stats.ks_2samp(list(syn_betweenness.values()), - list(real_betweenness.values())).statistic + list(real_betweenness.values())).statistic @metrics.register def outlet_nse_flow(synthetic_G: nx.Graph, From 2a5f9a008384b2bc94a4c0c934b342a52c27903f Mon Sep 17 00:00:00 2001 From: barneydobson Date: Thu, 7 Mar 2024 08:55:16 +0000 Subject: [PATCH 11/21] Update test_metric_utilities.py --- tests/test_metric_utilities.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index fa8be33a..a09ae1ed 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -10,6 +10,10 @@ from swmmanywhere.graph_utilities import load_graph +def assert_close(a: float, b: float, rtol: float = 1e-3) -> None: + """Assert that two floats are close.""" + assert np.isclose(a, b, rtol=rtol).all() + def get_subs(): """Get a GeoDataFrame of subcatchments.""" subs = [shapely.Polygon([(700262, 5709928), @@ -75,7 +79,7 @@ def test_bias_flood_depth(): real_results = real_results, synthetic_subs = synthetic_subs, real_subs = real_subs) - assert np.isclose(val, -0.29523809523809524) + assert_close(val, -0.2952) def test_kstest_betweenness(): """Test the kstest_betweenness metric.""" @@ -86,7 +90,7 @@ def test_kstest_betweenness(): G_ = G.copy() G_.remove_node(list(G.nodes)[0]) val = mu.metrics.kstest_betweenness(synthetic_G = G_, real_G = G) - assert np.isclose(val, 0.286231884057971) + assert_close(val, 0.2862) def test_best_outlet_match(): """Test the best_outlet_match and ks_betweenness.""" From a99950c9368d8f715420b8d9efba20d94cc7ef5b Mon Sep 17 00:00:00 2001 From: barneydobson Date: Thu, 7 Mar 2024 08:57:54 +0000 Subject: [PATCH 12/21] Update metric_utilities.py --- swmmanywhere/metric_utilities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 2e943989..f1859347 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -135,8 +135,8 @@ def best_outlet_match(synthetic_G: nx.Graph, if d['outlet'] == outlet] return synthetic_G.subgraph(outlet_nodes), outlet -def dominant_outlet(G: nx.Graph, - results: pd.DataFrame) -> tuple[nx.Graph,int]: +def dominant_outlet(G: nx.DiGraph, + results: pd.DataFrame) -> tuple[nx.DiGraph,int]: """Dominant outlet. Identify the outlet with highest flow along it and return the From 8465bb916c6ebb8635d51498cdaee1619050c701 Mon Sep 17 00:00:00 2001 From: barneydobson Date: Thu, 7 Mar 2024 09:27:06 +0000 Subject: [PATCH 13/21] Update metric_utilities.py proper subgraph values->to_numpy --- swmmanywhere/metric_utilities.py | 36 +++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index f1859347..9318fd10 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -87,12 +87,41 @@ def align_calc_nse(synthetic_results: pd.DataFrame, how='outer').sort_values(by='date') # Interpolate to time in real data - df['value_syn'] = df.set_index('date').value_syn.interpolate().values + df['value_syn'] = df.set_index('date').value_syn.interpolate().to_numpy() df = df.dropna(subset=['value_real']) # Calculate NSE return nse(df.value_real, df.value_syn) +def create_subgraph(G: nx.Graph, + nodes: list) -> nx.Graph: + """Create a subgraph. + + Create a subgraph of G based on the nodes list. Taken from networkx + documentation: https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.subgraph.html + + Args: + G (nx.Graph): The original graph. + nodes (list): The list of nodes to include in the subgraph. + + Returns: + nx.Graph: The subgraph. + """ + # Create a subgraph SG based on a (possibly multigraph) G + SG = G.__class__() + SG.add_nodes_from((n, G.nodes[n]) for n in nodes) + if SG.is_multigraph(): + SG.add_edges_from((n, nbr, key, d) + for n, nbrs in G.adj.items() if n in nodes + for nbr, keydict in nbrs.items() if nbr in nodes + for key, d in keydict.items()) + else: + SG.add_edges_from((n, nbr, d) + for n, nbrs in G.adj.items() if n in nodes + for nbr, d in nbrs.items() if nbr in nodes) + SG.graph.update(G.graph) + return SG + def nse(y: np.ndarray, yhat: np.ndarray) -> float: """Calculate Nash-Sutcliffe efficiency (NSE).""" @@ -133,7 +162,8 @@ def best_outlet_match(synthetic_G: nx.Graph, # Subselect the matching graph outlet_nodes = [n for n, d in synthetic_G.nodes(data=True) if d['outlet'] == outlet] - return synthetic_G.subgraph(outlet_nodes), outlet + sg = create_subgraph(synthetic_G,outlet_nodes) + return sg, outlet def dominant_outlet(G: nx.DiGraph, results: pd.DataFrame) -> tuple[nx.DiGraph,int]: @@ -165,7 +195,7 @@ def dominant_outlet(G: nx.DiGraph, if d['id'] == max_outlet_arc][0] # Subselect the matching graph - sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet}) + sg = create_subgraph(G, nx.ancestors(G, max_outlet) | {max_outlet}) return sg, max_outlet @metrics.register From 3699dd6a195515cd10c74bd9af9940b93aefa182 Mon Sep 17 00:00:00 2001 From: barneydobson Date: Thu, 7 Mar 2024 09:30:48 +0000 Subject: [PATCH 14/21] Update metric_utilities.py avoid intersects --- swmmanywhere/metric_utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 9318fd10..4c9d5cb0 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -153,7 +153,7 @@ def best_outlet_match(synthetic_G: nx.Graph, crs = synthetic_G.graph['crs']) .sjoin(real_subs, how="right", - predicate="intersects") + predicate="within") ) # Select the most common outlet From f12a790132b0544313f7b6a25c48d9ba980cb867 Mon Sep 17 00:00:00 2001 From: barneydobson Date: Thu, 7 Mar 2024 10:15:35 +0000 Subject: [PATCH 15/21] Ensure align_calc_nse dates are consistent format --- swmmanywhere/metric_utilities.py | 17 +++++++++++------ tests/test_metric_utilities.py | 6 +++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 4c9d5cb0..98fb7926 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -70,24 +70,29 @@ def align_calc_nse(synthetic_results: pd.DataFrame, efficiency (NSE) of the variable over time. In cases where the synthetic data is does not overlap the real data, the value is interpolated. """ + # Format dates + synthetic_results['date'] = pd.to_datetime(synthetic_results['date']) + real_results['date'] = pd.to_datetime(real_results['date']) + # Extract data syn_data = extract_var(synthetic_results, variable) syn_data = syn_data.loc[syn_data.object.isin(syn_ids)] - syn_data = syn_data.groupby('date').value.sum().reset_index() + syn_data = syn_data.groupby('date').value.sum() real_data = extract_var(real_results, variable) real_data = real_data.loc[real_data.object.isin(real_ids)] - real_data = real_data.groupby('date').value.sum().reset_index() - + real_data = real_data.groupby('date').value.sum() + # Align data df = pd.merge(syn_data, real_data, - on='date', + left_index = True, + right_index = True, suffixes=('_syn', '_real'), - how='outer').sort_values(by='date') + how='outer').sort_index() # Interpolate to time in real data - df['value_syn'] = df.set_index('date').value_syn.interpolate().to_numpy() + df['value_syn'] = df.value_syn.interpolate().to_numpy() df = df.dropna(subset=['value_real']) # Calculate NSE diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index a09ae1ed..5a72f5bb 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -123,11 +123,11 @@ def test_outlet_nse_flow(): results = pd.DataFrame([{'object' : 4253560, 'variable' : 'flow', 'value' : 10, - 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + 'date' : pd.to_datetime('2021-01-01').date()}, {'object' : '', 'variable' : 'flow', 'value' : 5, - 'date' : pd.to_datetime('2021-01-01 00:00:00')}, + 'date' : pd.to_datetime('2021-01-01').date()}, {'object' : 4253560, 'variable' : 'flow', 'value' : 5, @@ -136,7 +136,7 @@ def test_outlet_nse_flow(): 'variable' : 'flow', 'value' : 2, 'date' : pd.to_datetime('2021-01-01 00:00:05')}]) - + # Calculate NSE (perfect results) val = mu.metrics.outlet_nse_flow(synthetic_G = G, synthetic_results = results, From 927c74bd9ae81a8bc4e9da6193e1ebb8f7e556de Mon Sep 17 00:00:00 2001 From: Dobson Date: Thu, 7 Mar 2024 11:40:44 +0000 Subject: [PATCH 16/21] Add netcomp metrics --- dev-requirements.txt | 15 +++---- pyproject.toml | 3 +- requirements.txt | 15 +++---- swmmanywhere/metric_utilities.py | 72 ++++++++++++++++++++++++++++++++ tests/test_metric_utilities.py | 18 ++++++++ 5 files changed, 101 insertions(+), 22 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 066ce393..aac648e5 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -92,14 +92,9 @@ geojson==3.1.0 geopandas==0.14.2 # via # osmnx - # snkit # swmmanywhere (pyproject.toml) geopy==2.4.1 # via swmmanywhere (pyproject.toml) -gitdb==4.0.11 - # via gitpython -gitpython==3.1.41 - # via swmmanywhere (pyproject.toml) identify==2.5.33 # via pre-commit idna==3.6 @@ -130,8 +125,11 @@ mypy-extensions==1.0.0 # via mypy netcdf4==1.6.5 # via swmmanywhere (pyproject.toml) +netcomp @ git+https://github.com/barneydobson/NetComp.git + # via swmmanywhere (pyproject.toml) networkx==3.2.1 # via + # netcomp # osmnx # scikit-image # swmmanywhere (pyproject.toml) @@ -147,6 +145,7 @@ numpy==1.26.3 # imageio # matplotlib # netcdf4 + # netcomp # numba # osmnx # pandas @@ -259,6 +258,7 @@ scikit-image==0.22.0 # via pysheds scipy==1.12.0 # via + # netcomp # pysheds # salib # scikit-image @@ -267,16 +267,11 @@ shapely==2.0.2 # via # geopandas # osmnx - # snkit # swmmanywhere (pyproject.toml) six==1.16.0 # via # fiona # python-dateutil -smmap==5.0.1 - # via gitdb -snkit==1.9.0 - # via swmmanywhere (pyproject.toml) snuggs==1.4.7 # via rasterio swmm-toolkit==0.15.3 diff --git a/pyproject.toml b/pyproject.toml index 673acf7d..57d1fb34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,9 @@ dependencies = [ "fiona", "geopandas", "geopy", - "GitPython", "matplotlib", "netcdf4", + "netcomp@ git+https://github.com/barneydobson/NetComp.git", "networkx", "numpy", "osmnx", @@ -46,7 +46,6 @@ dependencies = [ "SALib", "SciPy", "shapely", - "snkit", "tqdm", "xarray", ] diff --git a/requirements.txt b/requirements.txt index cde240ba..69e76bca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -74,14 +74,9 @@ geojson==3.1.0 geopandas==0.14.2 # via # osmnx - # snkit # swmmanywhere (pyproject.toml) geopy==2.4.1 # via swmmanywhere (pyproject.toml) -gitdb==4.0.11 - # via gitpython -gitpython==3.1.41 - # via swmmanywhere (pyproject.toml) idna==3.6 # via requests imageio==2.33.1 @@ -102,8 +97,11 @@ multiprocess==0.70.15 # via salib netcdf4==1.6.5 # via swmmanywhere (pyproject.toml) +netcomp @ git+https://github.com/barneydobson/NetComp.git + # via swmmanywhere (pyproject.toml) networkx==3.2.1 # via + # netcomp # osmnx # scikit-image # swmmanywhere (pyproject.toml) @@ -117,6 +115,7 @@ numpy==1.26.3 # imageio # matplotlib # netcdf4 + # netcomp # numba # osmnx # pandas @@ -201,6 +200,7 @@ scikit-image==0.22.0 # via pysheds scipy==1.12.0 # via + # netcomp # pysheds # salib # scikit-image @@ -209,16 +209,11 @@ shapely==2.0.2 # via # geopandas # osmnx - # snkit # swmmanywhere (pyproject.toml) six==1.16.0 # via # fiona # python-dateutil -smmap==5.0.1 - # via gitdb -snkit==1.9.0 - # via swmmanywhere (pyproject.toml) snuggs==1.4.7 # via rasterio swmm-toolkit==0.15.3 diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 2e943989..d528cfbe 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -7,6 +7,7 @@ from typing import Callable import geopandas as gpd +import netcomp import networkx as nx import numpy as np import pandas as pd @@ -168,6 +169,77 @@ def dominant_outlet(G: nx.Graph, sg = G.subgraph(nx.ancestors(G, max_outlet) | {max_outlet}) return sg, max_outlet +def nc_compare(G1, G2, funcname, **kw): + """Compare two graphs using netcomp.""" + A1,A2 = [nx.adjacency_matrix(G) for G in (G1,G2)] + return getattr(netcomp, funcname)(A1,A2,**kw) + +@metrics.register +def nc_deltacon0(synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """Run the evaluated metric.""" + return nc_compare(synthetic_G, + real_G, + 'deltacon0', + eps = 1e-10) + +@metrics.register +def nc_laplacian_dist(synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """Run the evaluated metric.""" + return nc_compare(synthetic_G, + real_G, + 'lambda_dist', + k=10, + kind = 'laplacian') + +@metrics.register +def nc_laplacian_norm_dist(synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """Run the evaluated metric.""" + return nc_compare(synthetic_G, + real_G, + 'lambda_dist', + k=10, + kind = 'laplacian_norm') + +@metrics.register +def nc_adjacency_dist(synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """Run the evaluated metric.""" + return nc_compare(synthetic_G, + real_G, + 'lambda_dist', + k=10, + kind = 'adjacency') + +@metrics.register +def nc_vertex_edge_distance(synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """Run the evaluated metric. + + Do '1 -' because this metric is similarity not distance. + """ + return 1 - nc_compare(synthetic_G, + real_G, + 'vertex_edge_distance') + +@metrics.register +def nc_resistance_distance(synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """Run the evaluated metric.""" + return nc_compare(synthetic_G, + real_G, + 'resistance_distance', + check_connected = False, + renormalized = True) + @metrics.register def bias_flood_depth( synthetic_results: pd.DataFrame, diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index fa8be33a..c19219b5 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -263,3 +263,21 @@ def test_outlet_nse_flooding(): real_results = results, real_subs = subs) assert val == 0.0 + +def test_netcomp(): + """Test the netcomp metrics.""" + netcomp_results = {'nc_deltacon0' : 0.00129408, + 'nc_laplacian_dist' : 36.334773, + 'nc_laplacian_norm_dist' : 1.932007, + 'nc_adjacency_dist' : 3.542749, + 'nc_resistance_distance' : 8.098548, + 'nc_vertex_edge_distance' : 0.132075} + + for func, val in netcomp_results.items(): + G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') + val_ = getattr(mu.metrics, func)(synthetic_G = G, real_G = G) + assert val_ == 0.0, func + + G_ = load_graph(Path(__file__).parent / 'test_data' / 'street_graph.json') + val_ = getattr(mu.metrics, func)(synthetic_G = G_, real_G = G) + assert np.isclose(val, val_), func \ No newline at end of file From 1c7b9550b0d9646e1df25abea78019052d3b8558 Mon Sep 17 00:00:00 2001 From: Dobson Date: Thu, 7 Mar 2024 13:43:16 +0000 Subject: [PATCH 17/21] test new metrics --- swmmanywhere/misc/debug_topology.py | 60 +++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 swmmanywhere/misc/debug_topology.py diff --git a/swmmanywhere/misc/debug_topology.py b/swmmanywhere/misc/debug_topology.py new file mode 100644 index 00000000..86034b42 --- /dev/null +++ b/swmmanywhere/misc/debug_topology.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +"""Created 2024-03-07. + +@author: Barnaby Dobson +""" +from pathlib import Path +from time import time + +import matplotlib.pyplot as plt +import pandas as pd + +from swmmanywhere import metric_utilities as mu +from swmmanywhere.prepare_data import download_street + +# Download streets +Gs = download_street((-0.09547,51.52196,-0.09232,51.52422)) +Gm = download_street((-0.09547,51.52196,-0.08876,51.52661)) +Gl = download_street((-0.09547,51.52196,-0.08261,51.53097)) +Gsep = download_street((-0.08261,51.53097,-0.06946,51.53718)) + +# Define topological metrics +metrics = ['nc_deltacon0', + 'nc_laplacian_dist' , + 'nc_laplacian_norm_dist' , + 'nc_adjacency_dist' , + 'nc_resistance_distance' , + 'nc_vertex_edge_distance', + 'kstest_betweenness' + ] + +# Calculate metrics +results = [] +for G1,l1 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']): + for G2,l2 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']): + for func in metrics: + start = time() + val_ = getattr(mu.metrics, func)(synthetic_G = G1, real_G = G2) + end = time() + results.append({'func':func, + 'val':val_, + 'time':end - start, + 'l1':l1, + 'l2':l2}) +df = pd.DataFrame(results).sort_values(by=['l1','l2']) + +# Plot heatmap by func +f,axs = plt.subplots(4,2,figsize=(10,10)) +for (func,grp),ax in zip(df.groupby('func'),axs.reshape(-1)): + grp = grp.pivot(index='l1', + columns= 'l2', + values='val') + ax.imshow(grp,cmap='Reds') + ax.set_xticks(range(4)) + ax.set_yticks(range(4)) + ax.set_xticklabels(grp.columns,rotation=90) + ax.set_yticklabels(grp.index) + ax.set_title(func) +plots = Path(__file__).parent +f.tight_layout() +f.savefig(plots / 'heatmap.png') \ No newline at end of file From 64d7b0397d65c3edac2401f35e0a4b92edbf45e5 Mon Sep 17 00:00:00 2001 From: barneydobson Date: Mon, 11 Mar 2024 14:13:35 +0000 Subject: [PATCH 18/21] BC changes -Use taher's new function for edge betweenness -introduce new metric for edge betweenness (in contrast to node betweenness) -update requirements --- dev-requirements.txt | 18 +++++- pyproject.toml | 2 + requirements.txt | 8 ++- swmmanywhere/metric_utilities.py | 38 +++++++++++- swmmanywhere/misc/debug_topology.py | 89 +++++++++++++++-------------- tests/test_metric_utilities.py | 11 ++++ 6 files changed, 118 insertions(+), 48 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index aac648e5..21c25d87 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --extra=dev --output-file=dev-requirements.txt @@ -67,10 +67,14 @@ cramjam==2.7.0 # via fastparquet cycler==0.12.1 # via matplotlib +cytoolz==0.12.3 + # via swmmanywhere (pyproject.toml) dill==0.3.7 # via multiprocess distlib==0.3.8 # via virtualenv +exceptiongroup==1.2.0 + # via pytest fastparquet==2023.10.1 # via swmmanywhere (pyproject.toml) filelock==3.13.1 @@ -103,6 +107,8 @@ imageio==2.33.1 # via scikit-image iniconfig==2.0.0 # via pytest +joblib==1.3.2 + # via swmmanywhere (pyproject.toml) julian==0.14 # via pyswmm kiwisolver==1.4.5 @@ -278,6 +284,16 @@ swmm-toolkit==0.15.3 # via pyswmm tifffile==2024.1.30 # via scikit-image +tomli==2.0.1 + # via + # build + # coverage + # mypy + # pip-tools + # pyproject-hooks + # pytest +toolz==0.12.1 + # via cytoolz tqdm==4.66.2 # via # cdsapi diff --git a/pyproject.toml b/pyproject.toml index 57d1fb34..2f4f2d87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,12 @@ classifiers = [ dependencies = [ # TODO definitely don't need all of these "cdsapi", + "cytoolz", "fastparquet", "fiona", "geopandas", "geopy", + "joblib", "matplotlib", "netcdf4", "netcomp@ git+https://github.com/barneydobson/NetComp.git", diff --git a/requirements.txt b/requirements.txt index 69e76bca..4fa086d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile @@ -55,6 +55,8 @@ cramjam==2.7.0 # via fastparquet cycler==0.12.1 # via matplotlib +cytoolz==0.12.3 + # via swmmanywhere (pyproject.toml) dill==0.3.7 # via multiprocess fastparquet==2023.10.1 @@ -81,6 +83,8 @@ idna==3.6 # via requests imageio==2.33.1 # via scikit-image +joblib==1.3.2 + # via swmmanywhere (pyproject.toml) julian==0.14 # via pyswmm kiwisolver==1.4.5 @@ -220,6 +224,8 @@ swmm-toolkit==0.15.3 # via pyswmm tifffile==2024.1.30 # via scikit-image +toolz==0.12.1 + # via cytoolz tqdm==4.66.2 # via # cdsapi diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index fbe6755e..e138ecfa 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -12,7 +12,10 @@ import numpy as np import pandas as pd from scipy import stats - +import joblib +import networkx as nx +import cytoolz.curried as tlz +from collections import defaultdict class MetricRegistry(dict): """Registry object.""" @@ -209,6 +212,22 @@ def nc_compare(G1, G2, funcname, **kw): A1,A2 = [nx.adjacency_matrix(G) for G in (G1,G2)] return getattr(netcomp, funcname)(A1,A2,**kw) +def edge_betweenness_centrality(G: nx.Graph, normalized: bool = True, weight: str = "weight", njobs: int = -1): + """Parallel betweenness centrality function""" + njobs = joblib.cpu_count(True) if njobs == -1 else njobs + node_chunks = tlz.partition_all(G.order() // njobs, G.nodes()) + bt_func = tlz.partial(nx.edge_betweenness_centrality_subset, G=G, normalized=normalized, weight=weight) + bt_sc = joblib.Parallel(n_jobs=njobs)( + joblib.delayed(bt_func)(sources=nodes, targets=G.nodes()) for nodes in node_chunks + ) + + # Merge the betweenness centrality results + bt_c = defaultdict(float) + for bt in bt_sc: + for n, v in bt.items(): + bt_c[n] += v + return bt_c + @metrics.register def nc_deltacon0(synthetic_G: nx.Graph, real_G: nx.Graph, @@ -299,14 +318,27 @@ def _f(x): return (syn_tot - real_tot) / real_tot +@metrics.register +def kstest_edge_betweenness( + synthetic_G: nx.Graph, + real_G: nx.Graph, + **kwargs) -> float: + """Run the evaluated metric.""" + syn_betweenness = edge_betweenness_centrality(synthetic_G, weight=None) + real_betweenness = edge_betweenness_centrality(real_G, weight=None) + + #TODO does it make more sense to use statistic or pvalue? + return stats.ks_2samp(list(syn_betweenness.values()), + list(real_betweenness.values())).statistic + @metrics.register def kstest_betweenness( synthetic_G: nx.Graph, real_G: nx.Graph, **kwargs) -> float: """Run the evaluated metric.""" - syn_betweenness = nx.betweenness_centrality(synthetic_G) - real_betweenness = nx.betweenness_centrality(real_G) + syn_betweenness = nx.betweenness_centrality(synthetic_G, weight=None) + real_betweenness = nx.betweenness_centrality(real_G, weight=None) #TODO does it make more sense to use statistic or pvalue? return stats.ks_2samp(list(syn_betweenness.values()), diff --git a/swmmanywhere/misc/debug_topology.py b/swmmanywhere/misc/debug_topology.py index 86034b42..2593779a 100644 --- a/swmmanywhere/misc/debug_topology.py +++ b/swmmanywhere/misc/debug_topology.py @@ -12,49 +12,52 @@ from swmmanywhere import metric_utilities as mu from swmmanywhere.prepare_data import download_street -# Download streets -Gs = download_street((-0.09547,51.52196,-0.09232,51.52422)) -Gm = download_street((-0.09547,51.52196,-0.08876,51.52661)) -Gl = download_street((-0.09547,51.52196,-0.08261,51.53097)) -Gsep = download_street((-0.08261,51.53097,-0.06946,51.53718)) +if __name__ == '__main__': + + # Download streets + Gs = download_street((-0.09547,51.52196,-0.09232,51.52422)) + Gm = download_street((-0.09547,51.52196,-0.08876,51.52661)) + Gl = download_street((-0.09547,51.52196,-0.08261,51.53097)) + Gsep = download_street((-0.08261,51.53097,-0.06946,51.53718)) -# Define topological metrics -metrics = ['nc_deltacon0', - 'nc_laplacian_dist' , - 'nc_laplacian_norm_dist' , - 'nc_adjacency_dist' , - 'nc_resistance_distance' , - 'nc_vertex_edge_distance', - 'kstest_betweenness' - ] + # Define topological metrics + metrics = ['nc_deltacon0', + 'nc_laplacian_dist' , + 'nc_laplacian_norm_dist' , + 'nc_adjacency_dist' , + 'nc_resistance_distance' , + 'nc_vertex_edge_distance', + 'kstest_betweenness', + 'kstest_edge_betweenness', + ] -# Calculate metrics -results = [] -for G1,l1 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']): - for G2,l2 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']): - for func in metrics: - start = time() - val_ = getattr(mu.metrics, func)(synthetic_G = G1, real_G = G2) - end = time() - results.append({'func':func, - 'val':val_, - 'time':end - start, - 'l1':l1, - 'l2':l2}) -df = pd.DataFrame(results).sort_values(by=['l1','l2']) + # Calculate metrics + results = [] + for G1,l1 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']): + for G2,l2 in zip([Gs, Gm, Gl, Gsep], ['a_small', 'b_med', 'c_large', 'd_sep']): + for func in metrics: + start = time() + val_ = getattr(mu.metrics, func)(synthetic_G = G1, real_G = G2) + end = time() + results.append({'func':func, + 'val':val_, + 'time':end - start, + 'l1':l1, + 'l2':l2}) + df = pd.DataFrame(results).sort_values(by=['l1','l2']) -# Plot heatmap by func -f,axs = plt.subplots(4,2,figsize=(10,10)) -for (func,grp),ax in zip(df.groupby('func'),axs.reshape(-1)): - grp = grp.pivot(index='l1', - columns= 'l2', - values='val') - ax.imshow(grp,cmap='Reds') - ax.set_xticks(range(4)) - ax.set_yticks(range(4)) - ax.set_xticklabels(grp.columns,rotation=90) - ax.set_yticklabels(grp.index) - ax.set_title(func) -plots = Path(__file__).parent -f.tight_layout() -f.savefig(plots / 'heatmap.png') \ No newline at end of file + # Plot heatmap by func + f,axs = plt.subplots(4,2,figsize=(10,10)) + for (func,grp),ax in zip(df.groupby('func'),axs.reshape(-1)): + grp = grp.pivot(index='l1', + columns= 'l2', + values='val') + ax.imshow(grp,cmap='Reds') + ax.set_xticks(range(4)) + ax.set_yticks(range(4)) + ax.set_xticklabels(grp.columns,rotation=90) + ax.set_yticklabels(grp.index) + ax.set_title(func) + plots = Path(__file__).parent + f.tight_layout() + f.savefig(plots / 'heatmap.png') \ No newline at end of file diff --git a/tests/test_metric_utilities.py b/tests/test_metric_utilities.py index 978c61d0..5918f06c 100644 --- a/tests/test_metric_utilities.py +++ b/tests/test_metric_utilities.py @@ -92,6 +92,17 @@ def test_kstest_betweenness(): val = mu.metrics.kstest_betweenness(synthetic_G = G_, real_G = G) assert_close(val, 0.2862) +def test_kstest_edge_betweenness(): + """Test the kstest_betweenness metric.""" + G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') + val = mu.metrics.kstest_edge_betweenness(synthetic_G = G, real_G = G) + assert val == 0.0 + + G_ = G.copy() + G_.remove_node(list(G.nodes)[0]) + val = mu.metrics.kstest_edge_betweenness(synthetic_G = G_, real_G = G) + assert_close(val, 0.38995) + def test_best_outlet_match(): """Test the best_outlet_match and ks_betweenness.""" G = load_graph(Path(__file__).parent / 'test_data' / 'graph_topo_derived.json') From eb5a6019c51897bed0e765c52c5da92f923de0d9 Mon Sep 17 00:00:00 2001 From: barneydobson Date: Mon, 11 Mar 2024 14:19:19 +0000 Subject: [PATCH 19/21] Update metric_utilities.py formatting --- swmmanywhere/metric_utilities.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index e138ecfa..84f651fc 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -3,19 +3,19 @@ @author: Barnaby Dobson """ +from collections import defaultdict from inspect import signature from typing import Callable +import cytoolz.curried as tlz import geopandas as gpd +import joblib import netcomp import networkx as nx import numpy as np import pandas as pd from scipy import stats -import joblib -import networkx as nx -import cytoolz.curried as tlz -from collections import defaultdict + class MetricRegistry(dict): """Registry object.""" @@ -212,13 +212,20 @@ def nc_compare(G1, G2, funcname, **kw): A1,A2 = [nx.adjacency_matrix(G) for G in (G1,G2)] return getattr(netcomp, funcname)(A1,A2,**kw) -def edge_betweenness_centrality(G: nx.Graph, normalized: bool = True, weight: str = "weight", njobs: int = -1): - """Parallel betweenness centrality function""" +def edge_betweenness_centrality(G: nx.Graph, + normalized: bool = True, + weight: str = "weight", + njobs: int = -1): + """Parallel betweenness centrality function.""" njobs = joblib.cpu_count(True) if njobs == -1 else njobs node_chunks = tlz.partition_all(G.order() // njobs, G.nodes()) - bt_func = tlz.partial(nx.edge_betweenness_centrality_subset, G=G, normalized=normalized, weight=weight) + bt_func = tlz.partial(nx.edge_betweenness_centrality_subset, + G=G, + normalized=normalized, + weight=weight) bt_sc = joblib.Parallel(n_jobs=njobs)( - joblib.delayed(bt_func)(sources=nodes, targets=G.nodes()) for nodes in node_chunks + joblib.delayed(bt_func)(sources=nodes, + targets=G.nodes()) for nodes in node_chunks ) # Merge the betweenness centrality results From bdcbd8b376fb6f1f0013a9ea394e6f8b36ca7483 Mon Sep 17 00:00:00 2001 From: barneydobson Date: Mon, 11 Mar 2024 14:21:15 +0000 Subject: [PATCH 20/21] Update pyproject.toml formatting --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index abd90bcf..a3c4e1a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,8 +30,8 @@ dependencies = [ "fiona", "geopandas", "geopy", - "joblib", "GitPython", + "joblib", "loguru", "matplotlib", "netcdf4", From 4cf2675b79b1975b2ac7a55b55ae551bba8d39d9 Mon Sep 17 00:00:00 2001 From: barneydobson Date: Mon, 11 Mar 2024 14:27:59 +0000 Subject: [PATCH 21/21] Update metric_utilities.py more formatting... --- swmmanywhere/metric_utilities.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/swmmanywhere/metric_utilities.py b/swmmanywhere/metric_utilities.py index 84f651fc..77b76eae 100644 --- a/swmmanywhere/metric_utilities.py +++ b/swmmanywhere/metric_utilities.py @@ -5,7 +5,7 @@ """ from collections import defaultdict from inspect import signature -from typing import Callable +from typing import Callable, Optional import cytoolz.curried as tlz import geopandas as gpd @@ -214,7 +214,7 @@ def nc_compare(G1, G2, funcname, **kw): def edge_betweenness_centrality(G: nx.Graph, normalized: bool = True, - weight: str = "weight", + weight: Optional[str] = "weight", njobs: int = -1): """Parallel betweenness centrality function.""" njobs = joblib.cpu_count(True) if njobs == -1 else njobs @@ -229,7 +229,7 @@ def edge_betweenness_centrality(G: nx.Graph, ) # Merge the betweenness centrality results - bt_c = defaultdict(float) + bt_c: dict[int, float] = defaultdict(float) for bt in bt_sc: for n, v in bt.items(): bt_c[n] += v