polygonize rasters with weights (#23)
* weights works

* fix test

---------

Co-authored-by: fdobad <8339628-fdobad@users.noreply.gitlab.com>
fdobad and fdobad authored Nov 19, 2024
1 parent 62c0e49 commit b3c7506
Showing 2 changed files with 77 additions and 24 deletions.
99 changes: 76 additions & 23 deletions src/fire2a/agglomerative_clustering.py
@@ -5,9 +5,9 @@
## Usage
### Overview
1. Choose your raster files
2. Configure nodata and scaling strategies in the `config.toml` file
3. Choose "number of clusters" or "distance threshold" for the [Agglomerative](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html) clustering algorithm
- Start with a distance threshold of 10.0 and decrease for less or increase for more clusters
2. Configure nodata, scaling strategies and weights in the `config.toml` file
3. Choose "distance threshold" (or "number of clusters") for the [Agglomerative](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html) clustering algorithm. Recommended:
- Start with a distance threshold of 10.0 and decrease it for more or increase it for fewer clusters
- After calibrating the distance threshold:
- [Sieve](https://gdal.org/en/latest/programs/gdal_sieve.html) small clusters (merge them to the biggest neighbor) with the `--sieve integer_pixels_size` option
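For example, a hypothetical end-to-end run combining these options (flag names as documented below, file names illustrative):
```python
from fire2a.agglomerative_clustering import main

# calibrate the distance threshold first, then sieve clusters smaller than 30 pixels
main(["-d", "10.0", "--sieve", "30", "config.toml"])
```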
@@ -41,6 +41,7 @@
no_data_strategy = "most_frequent"
scaling_strategy = "onehot"
fill_value = 0
weight = 1
```
1. __scaling_strategy__
@@ -62,13 +63,33 @@
- default is 0
- [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)
4. __weight__
- default is 1
- used to give more importance to some features than others
- This is done after the nodata imputation and scaling steps, before clustering
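As a minimal sketch of the idea (toy values, not the package's code): after each feature is rescaled to a common [0, 1] range, its column is multiplied by its weight, so a larger weight stretches that axis and increases the feature's influence on the clustering distances.
```python
import numpy as np

# two features already rescaled to the common [0, 1] range
X = np.array([[0.0, 0.5],
              [1.0, 0.25]])
weights = np.array([1.0, 4.0])  # hypothetical per-raster weights from config.toml
Xw = X * weights                # column-wise multiply: feature 2 now counts 4x in distances
```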
#### 3. Clustering configuration
1. __Agglomerative__ clustering algorithm is used. The following parameters are mutually exclusive:
- `-n` or `--n_clusters`: The number of clusters to find.
- `-d` or `--distance_threshold`: The linkage distance threshold above which clusters will not be merged. When scaling, start at 10.0 and work downward (at 0.0 the whole algorithm is computed). A rough sketch follows this list.
For passing more parameters, see [here](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html)
- More [parameters](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html) for clustering can be passed directly into the `pipelie` method as keyword arguments
2. __Sieve filter__ is applied to remove small clusters. The sieve filter is applied using the [GDAL sieve library](https://gdal.org/en/latest/programs/gdal_sieve.html#gdal-sieve)
- `--sieve`: Use GDAL sieve filter to merge small clusters (number of pixels) into the biggest neighbor
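As a rough sketch of the clustering step (assuming, as the `neighbors=4` argument further below suggests, that the package's `CustomAgglomerativeClustering` constrains merges to a 4-connected pixel grid; this uses scikit-learn's stock estimator instead):
```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.image import grid_to_graph

height, width = 100, 80                      # illustrative raster shape
X = np.random.rand(height * width, 3)        # one row per pixel, one column per feature
connectivity = grid_to_graph(height, width)  # 4-connected pixel adjacency
labels = AgglomerativeClustering(
    n_clusters=None, distance_threshold=10.0, connectivity=connectivity
).fit_predict(X)
labels_reshaped = labels.reshape(height, width)
```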
#### 4. Post-processing
Outputs can be:
- A raster file with the cluster labels and a polygon file with the cluster polygons
- A polygon file with the cluster polygons, with an attribute giving the number of pixels in each cluster
- A plot of the input data distributions, the rescaled data distributions, and the cluster size history and histogram (crashes QGIS on Windows)
Or use the `--script` option to return the label_map and the pipeline objects for further processing in another Python script:
```python
from fire2a.agglomerative_clustering import main
label_map, pipe1, pipe2 = main(["-d", "10.0", "-s"])
```
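The returned objects can be inspected like any scikit-learn pipeline, for example (step names as used by the `plot` function below):
```python
label_map.shape                                   # (height, width) array of cluster labels
pipe1.named_steps["no_data_imputer"].output_data  # data after nodata imputation
pipe2.named_steps["agglomerative_clustering"]     # the fitted clustering step
```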
"""
# fmt: on
# from IPython.terminal.embed import InteractiveShellEmbed
@@ -159,22 +180,22 @@ def transform(self, X):
class RescaleAllToCommonRange(BaseEstimator, TransformerMixin):
"""A custom transformer that rescales all features to a common range [0, 1]"""

def __init__(self):
pass
def __init__(self, weight_map):
self.weight_map = weight_map

def fit(self, X, y=None):
# Determine the combined range of all scaled features
self.min_val = [x.min() for x in X]
self.max_val = [x.max() for x in X]
self.min_val = [x.min() for x in X.T]
self.max_val = [x.max() for x in X.T]
return self

def transform(self, X):
# Rescale all features to match the common range
for i, (x, mi, ma) in enumerate(zip(X, self.min_val, self.max_val)):
for i, (x, mi, ma) in enumerate(zip(X.T, self.min_val, self.max_val)):
if ma - mi == 0:
X[i] = x
X.T[i] = x * self.weight_map[i]
else:
X[i] = (x - mi) / (ma - mi)
X.T[i] = (x - mi) / (ma - mi) * self.weight_map[i]
return X
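# Minimal usage sketch (toy values, assuming a float input array):
#   X = np.array([[0.0, 10.0], [5.0, 30.0]])
#   RescaleAllToCommonRange(weight_map=[1, 2]).fit_transform(X)
#   -> array([[0., 0.], [1., 2.]])  # each column mapped to [0, 1], then multiplied by its weight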


@@ -235,6 +256,7 @@ def pipelie(observations, info_list, height, width, **kwargs):
no_data_values = [info["NoDataValue"] for info in info_list]
no_data_strategies = [info["no_data_strategy"] for info in info_list]
fill_values = [info["fill_value"] for info in info_list]
weights = [info["weight"] for info in info_list]
# scaling_strategies = [info["scaling_strategy"] for info in info_list]

# scaling strategies
@@ -248,6 +270,7 @@
robust_transformer = Pipeline(steps=[("robust_step", RobustScaler())])
standard_transformer = Pipeline(steps=[("standard_step", StandardScaler())])
onehot_transformer = Pipeline(steps=[("onehot_step", OneHotEncoder(sparse_output=False))])
# OneHotEncoder._n_features_outs):

# Combine transformers using ColumnTransformer
feature_scaler = ColumnTransformer(
@@ -266,23 +289,49 @@
# memory = joblib.Memory(location=temp_dir, verbose=0)

# Create and apply the pipeline
pipeline = Pipeline(
# part 1 until feature scaling
pipe1 = Pipeline(
# n_features_in_ : int
# feature_names_in_ : ndarray of shape (`n_features_in_`,)
steps=[
("no_data_imputer", NoDataImputer(no_data_values, no_data_strategies, fill_values)),
("feature_scaling", feature_scaler),
("common_rescaling", RescaleAllToCommonRange()),
],
# memory=memory,
verbose=True,
)
# map weights to new columns (onehot feature scaler creates one column per category)
obs1 = pipe1.fit_transform(observations)
cat_names = pipe1.named_steps["feature_scaling"]["onehot"].get_feature_names_out()
split_names = [name.split("_")[0] for name in cat_names]
cat_count = np.unique(split_names, return_counts=True)[1]
onehot_map = {}
for i, key in enumerate(index_map["onehot"]):
onehot_map[key] = cat_count[i]
# onehot_map = {key: cat_count[i] for i, key in enumerate(index_map["onehot"])}
weight_map = []
for name, idxs in index_map.items():
for idx in idxs:
if name == "onehot":
weight_map += [weights[idx]] * onehot_map[idx]
continue
weight_map += [weights[idx]]
# part 2 use weight_map and cluster
pipe2 = Pipeline(
steps=[
("common_rescaling", RescaleAllToCommonRange(weight_map)),
("agglomerative_clustering", CustomAgglomerativeClustering(height, width, neighbors=4, **kwargs)),
],
# memory=memory,
verbose=True,
)

# apply pipeLIE
labels = pipeline.fit_predict(observations)
labels = pipe2.fit_predict(obs1)

# Reshape the labels back to the original spatial map shape
labels_reshaped = labels.reshape(height, width)
return labels_reshaped, pipeline
return labels_reshaped, pipe1, pipe2
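# Worked sketch of the weight bookkeeping above (made-up mapping and counts):
#   index_map = {"onehot": [0], "robust": [1]}  # raster 0 categorical, raster 1 numeric
#   weights = [2, 1]; onehot_map = {0: 3}       # raster 0 expands into 3 category columns
#   -> weight_map == [2, 2, 2, 1]               # the onehot weight is repeated per category column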


def write(
@@ -399,11 +448,12 @@ def write(
return True


def plot(labels_reshaped, pipeline, info_list, **kwargs):
def plot(labels_reshaped, pipe1, pipe2, info_list, **kwargs):
"""Plot the observed values of the input data, the rescaled data, and the cluster size history and histogram.
Args:
labels_reshaped (np.ndarray): The reshaped labels of the clusters
pipeline (Pipeline): The pipeline object containing all the steps of the pipeline
pipe1 (Pipeline): The first pipeline object containing imputer and feature scaling steps
pipe2 (Pipeline): The second pipeline object containing the rescaling and clustering steps
info_list (list): A list of dictionaries containing information about each feature
**kwargs: Additional keyword arguments
n_clusters (int): The number of clusters
@@ -414,8 +464,8 @@ def plot(labels_reshaped, pipeline, info_list, **kwargs):
"""
from matplotlib import pyplot as plt

no_data_imputed = pipeline.named_steps["no_data_imputer"].output_data
pre_aggclu_data = pipeline.named_steps["agglomerative_clustering"].input_data
no_data_imputed = pipe1.named_steps["no_data_imputer"].output_data
pre_aggclu_data = pipe2.named_steps["agglomerative_clustering"].input_data

# filter out onehot features
num_onehots = sum([1 for i in info_list if i["scaling_strategy"] == "onehot"])
@@ -667,6 +717,8 @@ def main(argv=None):
config[filename]["scaling_strategy"] = "robust"
if "fill_value" not in file_config:
config[filename]["fill_value"] = 0
if "weight" not in file_config:
config[filename]["weight"] = 1
logger.debug(config)

# 3. READ DATA
@@ -679,6 +731,7 @@
info["no_data_strategy"] = file_config["no_data_strategy"]
info["scaling_strategy"] = file_config["scaling_strategy"]
info["fill_value"] = file_config["fill_value"]
info["weight"] = file_config["weight"]
data_list += [data]
info_list += [info]
logger.debug("%s", data[:2, :2])
@@ -691,7 +744,7 @@
observations = np.column_stack([data.ravel() for data in data_list])

# 6. nodata -> feature scaling -> all scaling -> clustering
labels_reshaped, pipeline = pipelie(
labels_reshaped, pipe1, pipe2 = pipelie(
observations,
info_list,
height,
@@ -709,7 +762,7 @@

# 7. debugging plots
if args.plots:
plot(labels_reshaped, pipeline, info_list, **vars(args))
plot(labels_reshaped, pipe1, pipe2, info_list, **vars(args))

# 8. WRITE RASTER
if not args.no_write:
@@ -726,7 +779,7 @@

# 9. SCRIPT MODE
if args.script:
return labels_reshaped, pipeline  # it also fails for me in ipython
return labels_reshaped, pipe1, pipe2

return 0

2 changes: 1 addition & 1 deletion tests/test_agglomerative_clustering.py
@@ -48,7 +48,7 @@ def test_agg1(request, tmp_path):

with MonkeyPatch.context() as mp:
mp.chdir(tmp_path)
label_map, pipeline = main(["-s", "-n", "10", "config.toml"])
label_map, pipe1, pipe2 = main(["-s", "-n", "10", "config.toml"])
assert Path("output.gpkg").is_file()

assert label_map.shape == (597, 658)
