Drop python 3.8, add python 3.12 support
araffin committed Nov 18, 2024
1 parent b8ff1a6 commit e5a1028
Showing 16 changed files with 97 additions and 86 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/ci.yml
@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
include:
# Default version
- gymnasium-version: "1.0.0"
@@ -51,6 +51,7 @@ jobs:
- name: Install specific version of gym
run: |
uv pip install --system gymnasium==${{ matrix.gymnasium-version }}
uv pip install --system "numpy<2"
# Only run for python 3.10, downgrade gym to 0.29.1

- name: Lint with ruff
@@ -65,8 +66,6 @@
- name: Type check
run: |
make type
# Do not run for python 3.8 (mypy internal error)
if: matrix.python-version != '3.8'
- name: Test with pytest
run: |
make pytest
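The matrix above drops 3.8 and adds 3.12, and the step that installs a specific Gymnasium version now also pins `numpy<2`, since Gymnasium releases that pre-date NumPy 2 are not guaranteed to work against it. A hedged sketch (not part of this commit) of how the same constraint could be reproduced locally as a pytest skip condition; the marker name is hypothetical and assumes the `packaging` package is available:

```python
# Sketch only: skip tests on the NumPy 2 + old Gymnasium combination,
# mirroring the CI job that installs "numpy<2" alongside gymnasium==0.29.1.
import gymnasium
import numpy as np
import pytest
from packaging.version import Version

requires_numpy1 = pytest.mark.skipif(
    Version(gymnasium.__version__) < Version("1.0.0")
    and Version(np.__version__) >= Version("2.0"),
    reason="Gymnasium releases before 1.0 pre-date NumPy 2",
)


@requires_numpy1
def test_old_gymnasium_env():
    # Placeholder body; a real test would exercise an environment here.
    assert gymnasium.make("CartPole-v1") is not None
```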
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,19 @@
## Release 2.5.0a0 (WIP)

### Breaking Changes
- Upgraded to PyTorch >= 2.3.0
- Upgraded to SB3 >= 2.5.0

### New Features
- Added support for NumPy v2

### Bug fixes

### Documentation

### Other


## Release 2.4.0 (2024-11-18)

**New algorithm: CrossQ, Gymnasium v1.0 support, and better defaults for SAC/TQC on Swimmer-v4 env**
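The 2.5.0a0 entry above raises the minimum PyTorch and Stable-Baselines3 versions and declares NumPy v2 support. A minimal sketch (not part of the commit) of how a downstream script could check the new floors at runtime, assuming the `packaging` package is available:

```python
# Sketch only: verify the minimum versions announced for release 2.5.0a0.
import torch
import stable_baselines3 as sb3
from packaging.version import Version

assert Version(torch.__version__) >= Version("2.3.0"), "RL Zoo 2.5.0a0 expects PyTorch >= 2.3.0"
assert Version(sb3.__version__) >= Version("2.5.0a0"), "RL Zoo 2.5.0a0 expects SB3 >= 2.5.0"
print("PyTorch and Stable-Baselines3 meet the new minimums")
```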
3 changes: 1 addition & 2 deletions docs/conf.py
@@ -14,7 +14,6 @@
import datetime
import os
import sys
from typing import Dict

# We CANNOT enable 'sphinxcontrib.spelling' because ReadTheDocs.org does not support
# PyEnchant.
@@ -151,7 +150,7 @@ def setup(app):

# -- Options for LaTeX output ------------------------------------------------

latex_elements: Dict[str, str] = {
latex_elements: dict[str, str] = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,8 +1,8 @@
[tool.ruff]
# Same as Black.
line-length = 127
# Assume Python 3.8
target-version = "py38"
# Assume Python 3.9
target-version = "py39"

[tool.ruff.lint]
# See https://beta.ruff.rs/docs/rules/
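Bumping `target-version` to `py39` matches the new minimum interpreter and is what the rest of the commit relies on when it replaces `typing.Dict`, `List`, `Tuple`, and `Type` with the builtin generics from PEP 585 (available since Python 3.9). A before/after illustration, not taken from the repository:

```python
from typing import Any, Optional

# Python 3.8 style (needs Dict/List/Tuple imports from typing):
#   def read(path: str) -> Tuple[Dict[str, Any], List[str]]: ...

# Python 3.9+ style used throughout this commit: builtin generics, no extra imports.
def read(path: str) -> tuple[dict[str, Any], list[str]]:
    hyperparams: dict[str, Any] = {}
    warnings: list[str] = []
    return hyperparams, warnings


# type[...] likewise replaces typing.Type:
error_class: Optional[type[Exception]] = None
```

If the project's lint selection enables ruff's pyupgrade rules, the higher target version also lets them flag the old `typing` aliases automatically (an assumption about the configuration, which is not shown in this diff).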
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
gym==0.26.2
stable-baselines3[extra,tests,docs]>=2.4.0,<3.0
stable-baselines3[extra,tests,docs]>=2.5.0a0,<3.0
box2d-py==2.3.8
pybullet_envs_gymnasium>=0.5.0
# minigrid
3 changes: 1 addition & 2 deletions rl_zoo3/benchmark.py
@@ -3,7 +3,6 @@
import os
import shutil
import subprocess
from typing import Dict, List

import numpy as np
import pandas as pd
@@ -33,7 +32,7 @@
trained_models.update(get_hf_trained_models())

n_experiments = len(trained_models)
results: Dict[str, List] = {
results: dict[str, list] = {
"algo": [],
"env_id": [],
"mean_reward": [],
4 changes: 2 additions & 2 deletions rl_zoo3/callbacks.py
@@ -4,7 +4,7 @@
from copy import deepcopy
from functools import wraps
from threading import Thread
from typing import Optional, Type, Union
from typing import Optional, Union

import optuna
from sb3_contrib import TQC
@@ -119,7 +119,7 @@ def __init__(self, gradient_steps: int = 100, verbose: int = 0, sleep_time: floa
self._model: Union[SAC, TQC]
self.gradient_steps = gradient_steps
self.process: Thread
self.model_class: Union[Type[SAC], Type[TQC]]
self.model_class: Union[type[SAC], type[TQC]]
self.sleep_time = sleep_time

def _init_callback(self) -> None:
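`model_class` now uses the builtin `type[...]` form; the annotation stores the class itself so that a model can be constructed from it later. A hedged sketch of the pattern (the constructor call is illustrative, not copied from the callback):

```python
from typing import Union

from sb3_contrib import TQC
from stable_baselines3 import SAC

# Holding the class (not an instance) lets a callback rebuild a model on demand.
model_class: Union[type[SAC], type[TQC]] = SAC
model = model_class("MlpPolicy", "Pendulum-v1", verbose=0)
```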
48 changes: 24 additions & 24 deletions rl_zoo3/exp_manager.py
@@ -7,7 +7,7 @@
from collections import OrderedDict
from pathlib import Path
from pprint import pprint
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Optional, Union

import gymnasium as gym
import numpy as np
@@ -71,9 +71,9 @@ def __init__(
eval_freq: int = 10000,
n_eval_episodes: int = 5,
save_freq: int = -1,
hyperparams: Optional[Dict[str, Any]] = None,
env_kwargs: Optional[Dict[str, Any]] = None,
eval_env_kwargs: Optional[Dict[str, Any]] = None,
hyperparams: Optional[dict[str, Any]] = None,
env_kwargs: Optional[dict[str, Any]] = None,
eval_env_kwargs: Optional[dict[str, Any]] = None,
trained_agent: str = "",
optimize_hyperparameters: bool = False,
storage: Optional[str] = None,
@@ -112,10 +112,10 @@ def __init__(
default_path = Path(__file__).parent.parent

self.config = config or str(default_path / f"hyperparams/{self.algo}.yml")
self.env_kwargs: Dict[str, Any] = env_kwargs or {}
self.env_kwargs: dict[str, Any] = env_kwargs or {}
self.n_timesteps = n_timesteps
self.normalize = False
self.normalize_kwargs: Dict[str, Any] = {}
self.normalize_kwargs: dict[str, Any] = {}
self.env_wrapper: Optional[Callable] = None
self.frame_stack = None
self.seed = seed
@@ -124,23 +124,23 @@
self.vec_env_class = {"dummy": DummyVecEnv, "subproc": SubprocVecEnv}[vec_env_type]
self.vec_env_wrapper: Optional[Callable] = None

self.vec_env_kwargs: Dict[str, Any] = {}
self.vec_env_kwargs: dict[str, Any] = {}
# self.vec_env_kwargs = {} if vec_env_type == "dummy" else {"start_method": "fork"}

# Callbacks
self.specified_callbacks: List = []
self.callbacks: List[BaseCallback] = []
self.specified_callbacks: list = []
self.callbacks: list[BaseCallback] = []
# Use env-kwargs if eval_env_kwargs was not specified
self.eval_env_kwargs: Dict[str, Any] = eval_env_kwargs or self.env_kwargs
self.eval_env_kwargs: dict[str, Any] = eval_env_kwargs or self.env_kwargs
self.save_freq = save_freq
self.eval_freq = eval_freq
self.n_eval_episodes = n_eval_episodes
self.n_eval_envs = n_eval_envs

self.n_envs = 1 # it will be updated when reading hyperparams
self.n_actions = 0 # For DDPG/TD3 action noise objects
self._hyperparams: Dict[str, Any] = {}
self.monitor_kwargs: Dict[str, Any] = {}
self._hyperparams: dict[str, Any] = {}
self.monitor_kwargs: dict[str, Any] = {}

self.trained_agent = trained_agent
self.continue_training = trained_agent.endswith(".zip") and os.path.isfile(trained_agent)
@@ -179,7 +179,7 @@ def __init__(
)
self.params_path = f"{self.save_path}/{self.env_name}"

def setup_experiment(self) -> Optional[Tuple[BaseAlgorithm, Dict[str, Any]]]:
def setup_experiment(self) -> Optional[tuple[BaseAlgorithm, dict[str, Any]]]:
"""
Read hyperparameters, pre-process them (create schedules, wrappers, callbacks, action noise objects)
create the environment and possibly the model.
@@ -223,7 +223,7 @@ def learn(self, model: BaseAlgorithm) -> None:
"""
:param model: an initialized RL model
"""
kwargs: Dict[str, Any] = {}
kwargs: dict[str, Any] = {}
if self.log_interval > -1:
kwargs = {"log_interval": self.log_interval}

@@ -272,7 +272,7 @@ def save_trained_model(self, model: BaseAlgorithm) -> None:
assert vec_normalize is not None
vec_normalize.save(os.path.join(self.params_path, "vecnormalize.pkl"))

def _save_config(self, saved_hyperparams: Dict[str, Any]) -> None:
def _save_config(self, saved_hyperparams: dict[str, Any]) -> None:
"""
Save unprocessed hyperparameters; this can be used later
to reproduce an experiment.
@@ -290,15 +290,15 @@ def _save_config(self, saved_hyperparams: Dict[str, Any]) -> None:

print(f"Log path: {self.save_path}")

def read_hyperparameters(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
def read_hyperparameters(self) -> tuple[dict[str, Any], dict[str, Any]]:
print(f"Loading hyperparameters from: {self.config}")

if self.config.endswith(".yml") or self.config.endswith(".yaml"):
# Load hyperparameters from yaml file
with open(self.config) as f:
hyperparams_dict = yaml.safe_load(f)
elif self.config.endswith(".py"):
global_variables: Dict = {}
global_variables: dict = {}
# Load hyperparameters from python file
exec(Path(self.config).read_text(), global_variables)
hyperparams_dict = global_variables["hyperparams"]
@@ -327,7 +327,7 @@ def read_hyperparameters(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
return hyperparams, saved_hyperparams

@staticmethod
def _preprocess_schedules(hyperparams: Dict[str, Any]) -> Dict[str, Any]:
def _preprocess_schedules(hyperparams: dict[str, Any]) -> dict[str, Any]:
# Create schedules
for key in ["learning_rate", "clip_range", "clip_range_vf", "delta_std"]:
if key not in hyperparams:
Expand All @@ -345,7 +345,7 @@ def _preprocess_schedules(hyperparams: Dict[str, Any]) -> Dict[str, Any]:
raise ValueError(f"Invalid value for {key}: {hyperparams[key]}")
return hyperparams

def _preprocess_normalization(self, hyperparams: Dict[str, Any]) -> Dict[str, Any]:
def _preprocess_normalization(self, hyperparams: dict[str, Any]) -> dict[str, Any]:
if "normalize" in hyperparams.keys():
self.normalize = hyperparams["normalize"]

Expand All @@ -370,8 +370,8 @@ def _preprocess_normalization(self, hyperparams: Dict[str, Any]) -> Dict[str, An
return hyperparams

def _preprocess_hyperparams( # noqa: C901
self, hyperparams: Dict[str, Any]
) -> Tuple[Dict[str, Any], Optional[Callable], List[BaseCallback], Optional[Callable]]:
self, hyperparams: dict[str, Any]
) -> tuple[dict[str, Any], Optional[Callable], list[BaseCallback], Optional[Callable]]:
self.n_envs = hyperparams.get("n_envs", 1)

if self.verbose > 0:
Expand Down Expand Up @@ -448,8 +448,8 @@ def _preprocess_hyperparams( # noqa: C901
return hyperparams, env_wrapper, callbacks, vec_env_wrapper

def _preprocess_action_noise(
self, hyperparams: Dict[str, Any], saved_hyperparams: Dict[str, Any], env: VecEnv
) -> Dict[str, Any]:
self, hyperparams: dict[str, Any], saved_hyperparams: dict[str, Any], env: VecEnv
) -> dict[str, Any]:
# Parse noise string
# Note: only off-policy algorithms are supported
if hyperparams.get("noise_type") is not None:
Expand Down Expand Up @@ -667,7 +667,7 @@ def make_env(**kwargs) -> gym.Env:

return env

def _load_pretrained_agent(self, hyperparams: Dict[str, Any], env: VecEnv) -> BaseAlgorithm:
def _load_pretrained_agent(self, hyperparams: dict[str, Any], env: VecEnv) -> BaseAlgorithm:
# Continue training
print("Loading pretrained agent")
# Policy should not be changed
Expand Down
26 changes: 13 additions & 13 deletions rl_zoo3/hyperparams_opt.py
@@ -1,4 +1,4 @@
from typing import Any, Dict
from typing import Any

import numpy as np
import optuna
@@ -8,7 +8,7 @@
from rl_zoo3 import linear_schedule


def sample_ppo_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_ppo_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for PPO hyperparams.
@@ -76,7 +76,7 @@ def sample_ppo_params(trial: optuna.Trial, n_actions: int, n_envs: int, addition
}


def sample_ppo_lstm_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_ppo_lstm_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for RecurrentPPO hyperparams.
uses sample_ppo_params(), this function samples for the policy_kwargs
@@ -98,7 +98,7 @@ def sample_ppo_lstm_params(trial: optuna.Trial, n_actions: int, n_envs: int, add
return hyperparams


def sample_trpo_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_trpo_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for TRPO hyperparams.
@@ -165,7 +165,7 @@ def sample_trpo_params(trial: optuna.Trial, n_actions: int, n_envs: int, additio
}


def sample_a2c_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_a2c_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for A2C hyperparams.
@@ -229,7 +229,7 @@ def sample_a2c_params(trial: optuna.Trial, n_actions: int, n_envs: int, addition
}


def sample_sac_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_sac_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for SAC hyperparams.
@@ -290,7 +290,7 @@ def sample_sac_params(trial: optuna.Trial, n_actions: int, n_envs: int, addition
return hyperparams


def sample_td3_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_td3_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for TD3 hyperparams.
@@ -346,7 +346,7 @@ def sample_td3_params(trial: optuna.Trial, n_actions: int, n_envs: int, addition
return hyperparams


def sample_ddpg_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_ddpg_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for DDPG hyperparams.
@@ -400,7 +400,7 @@ def sample_ddpg_params(trial: optuna.Trial, n_actions: int, n_envs: int, additio
return hyperparams


def sample_dqn_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_dqn_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for DQN hyperparams.
@@ -444,7 +444,7 @@ def sample_dqn_params(trial: optuna.Trial, n_actions: int, n_envs: int, addition
return hyperparams


def sample_her_params(trial: optuna.Trial, hyperparams: Dict[str, Any], her_kwargs: Dict[str, Any]) -> Dict[str, Any]:
def sample_her_params(trial: optuna.Trial, hyperparams: dict[str, Any], her_kwargs: dict[str, Any]) -> dict[str, Any]:
"""
Sampler for HerReplayBuffer hyperparams.
@@ -461,7 +461,7 @@ def sample_her_params(trial: optuna.Trial, hyperparams: Dict[str, Any], her_kwar
return hyperparams


def sample_tqc_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_tqc_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for TQC hyperparams.
@@ -480,7 +480,7 @@ def sample_tqc_params(trial: optuna.Trial, n_actions: int, n_envs: int, addition
return hyperparams


def sample_qrdqn_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_qrdqn_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for QR-DQN hyperparams.
@@ -496,7 +496,7 @@ def sample_qrdqn_params(trial: optuna.Trial, n_actions: int, n_envs: int, additi
return hyperparams


def sample_ars_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> Dict[str, Any]:
def sample_ars_params(trial: optuna.Trial, n_actions: int, n_envs: int, additional_args: dict) -> dict[str, Any]:
"""
Sampler for ARS hyperparams.
:param trial:
Expand Down