diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4503f1b9..0469f78f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -29,24 +29,24 @@ jobs: fail-fast: false matrix: os: ["ubuntu-latest", "macOS-latest", "windows-latest"] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] # Only test lowest and highest version on the expensive/slow # macOS and windows runners (UPDATE when supported versions change): exclude: - os: macOS-latest - python-version: 3.10 + python-version: "3.10" - os: macOS-latest - python-version: 3.9 + python-version: "3.11" - os: windows-latest - python-version: 3.10 + python-version: "3.10" - os: windows-latest - python-version: 3.9 + python-version: "3.11" steps: - uses: actions/checkout@v3 # More info on options: https://github.com/conda-incubator/setup-miniconda - - uses: mamba-org/provision-with-micromamba@main + - uses: mamba-org/setup-micromamba@main with: environment-file: devtools/conda-envs/test_env.yaml environment-name: test diff --git a/CHANGES b/CHANGES index 7a8ed754..71fd2912 100644 --- a/CHANGES +++ b/CHANGES @@ -14,13 +14,22 @@ The rules for this file: ------------------------------------------------------------------------------ -*/*/2023 hl2500 +*/*/2023 hl2500, xiki-tempula * 2.2.0 - + +Changes + - Require pandas >= 2.1 (PR #340) + - For pandas>=2.1, metadata will be loaded from the parquet file (issue #331, PR #340). + - Add support for Python 3.12, remove Python 3.8 support (issue #341, PR #304). + Enhancements - Add a TI estimator using gaussian quadrature to calculate the free energy. 
(issue #302, PR #304) + - Warning issued when the series is `None` for `statistical_inefficiency` + (issue #337, PR #338) + - ValueError issued when `df` and `series` for `statistical_inefficiency` + don't have the same length (issue #337, PR #338) 22/06/2023 xiki-tempula diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 87470b55..a8795f1c 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -4,7 +4,7 @@ channels: dependencies: - python - numpy -- pandas +- pandas>=2.1 - pymbar>=4 - scipy - scikit-learn diff --git a/environment.yml b/environment.yml index 4d2d9bda..025ff953 100644 --- a/environment.yml +++ b/environment.yml @@ -4,9 +4,10 @@ channels: dependencies: - python - numpy -- pandas +- pandas>=2.1 - pymbar>=4 - scipy - scikit-learn - pyarrow - matplotlib +- loguru diff --git a/readthedocs.yml b/readthedocs.yml index 71851b19..0c61d50f 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -13,7 +13,7 @@ build: python: "mambaforge-4.10" conda: - environment: environment.yml + environment: devtools/conda-envs/test_env.yaml python: install: diff --git a/setup.py b/setup.py index 2dbfed19..67df7d72 100755 --- a/setup.py +++ b/setup.py @@ -29,10 +29,10 @@ "Operating System :: Microsoft :: Windows ", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Chemistry", @@ -43,11 +43,11 @@ license="BSD", long_description=open("README.rst").read(), long_description_content_type="text/x-rst", - python_requires=">=3.8", + python_requires=">=3.9", tests_require=["pytest", "alchemtest"], install_requires=[ "numpy", - "pandas>=1.4", + "pandas>=2.1", 
"pymbar>=4", "scipy", "scikit-learn", diff --git a/src/alchemlyb/parsing/parquet.py b/src/alchemlyb/parsing/parquet.py index 180817ae..8f9fc408 100644 --- a/src/alchemlyb/parsing/parquet.py +++ b/src/alchemlyb/parsing/parquet.py @@ -1,9 +1,42 @@ import pandas as pd +from loguru import logger from . import _init_attrs -@_init_attrs +def _read_parquet_with_metadata(path: str, T: float) -> pd.DataFrame: + """ + Check if the metadata is included in the Dataframe and has the correct + temperature. + + Parameters + ---------- + path : str + Path to parquet file to extract dataframe from. + T : float + Temperature in Kelvin of the simulations. + + Returns + ------- + DataFrame + """ + df = pd.read_parquet(path) + if "temperature" not in df.attrs: + logger.warning( + f"No temperature metadata found in {path}. " + f"Serialise the Dataframe with pandas>=2.1 to preserve the metadata." + ) + df.attrs["temperature"] = T + df.attrs["energy_unit"] = "kT" + else: + if df.attrs["temperature"] != T: + raise ValueError( + f"Temperature in the input ({T}) doesn't match the temperature " + f"in the dataframe ({df.attrs['temperature']})." + ) + return df + + def extract_u_nk(path, T): r"""Return reduced potentials `u_nk` (unit: kT) from a pandas parquet file. @@ -36,7 +69,7 @@ def extract_u_nk(path, T): .. versionadded:: 2.1.0 """ - u_nk = pd.read_parquet(path) + u_nk = _read_parquet_with_metadata(path, T) columns = list(u_nk.columns) if isinstance(columns[0], str) and columns[0][0] == "(": new_columns = [] @@ -81,4 +114,4 @@ def extract_dHdl(path, T): .. 
versionadded:: 2.1.0 """ - return pd.read_parquet(path) + return _read_parquet_with_metadata(path, T) diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index 4633a87e..92813375 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -363,6 +363,15 @@ def _prepare_input(df, series, drop_duplicates, sort): series : Series Formatted Series. """ + if series is None: + warnings.warn( + "The series input is `None`, would not subsample according to statistical inefficiency." + ) + + elif len(df) != len(series): + raise ValueError( + f"The length of df ({len(df)}) should be same as the length of series ({len(series)})." + ) if _check_multiple_times(df): if drop_duplicates: df, series = _drop_duplicates(df, series) diff --git a/src/alchemlyb/tests/conftest.py b/src/alchemlyb/tests/conftest.py index d7a3749d..ac216471 100644 --- a/src/alchemlyb/tests/conftest.py +++ b/src/alchemlyb/tests/conftest.py @@ -82,7 +82,7 @@ def gmx_ABFE(): @pytest.fixture -def gmx_ABFE_complex_n_uk(gmx_ABFE): +def gmx_ABFE_complex_u_nk(gmx_ABFE): return [gmx.extract_u_nk(file, T=300) for file in gmx_ABFE["complex"]] diff --git a/src/alchemlyb/tests/parsing/test_parquet.py b/src/alchemlyb/tests/parsing/test_parquet.py index a2d788f6..c42cccb6 100644 --- a/src/alchemlyb/tests/parsing/test_parquet.py +++ b/src/alchemlyb/tests/parsing/test_parquet.py @@ -12,12 +12,45 @@ def test_extract_dHdl(dHdl_list, request, tmp_path): new_dHdl = extract_dHdl(str(tmp_path / "dhdl.parquet"), T=300) assert (new_dHdl.columns == dHdl.columns).all() assert (new_dHdl.index == dHdl.index).all() + assert new_dHdl.attrs["temperature"] == 300 + assert new_dHdl.attrs["energy_unit"] == "kT" -@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_n_uk"]) +@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_u_nk"]) -def test_extract_dHdl(u_nk_list, request, tmp_path): +def test_extract_u_nk(u_nk_list, request, tmp_path): u_nk = 
request.getfixturevalue(u_nk_list)[0] u_nk.to_parquet(path=str(tmp_path / "u_nk.parquet"), index=True) new_u_nk = extract_u_nk(str(tmp_path / "u_nk.parquet"), T=300) assert (new_u_nk.columns == u_nk.columns).all() assert (new_u_nk.index == u_nk.index).all() + assert new_u_nk.attrs["temperature"] == 300 + assert new_u_nk.attrs["energy_unit"] == "kT" + + +@pytest.fixture() +def u_nk(gmx_ABFE_complex_u_nk): + return gmx_ABFE_complex_u_nk[0] + + +def test_no_T(u_nk, tmp_path, caplog): + u_nk.attrs = {} + u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True) + extract_u_nk(str(tmp_path / "temp.parquet"), 300) + assert ( + "Serialise the Dataframe with pandas>=2.1 to preserve the metadata." + in caplog.text + ) + + +def test_wrong_T(u_nk, tmp_path, caplog): + u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True) + with pytest.raises(ValueError, match="doesn't match the temperature"): + extract_u_nk(str(tmp_path / "temp.parquet"), 400) + + +def test_metadata_unchanged(u_nk, tmp_path): + u_nk.attrs = {"temperature": 400, "energy_unit": "kcal/mol"} + u_nk.to_parquet(path=str(tmp_path / "temp.parquet"), index=True) + new_u_nk = extract_u_nk(str(tmp_path / "temp.parquet"), 400) + assert new_u_nk.attrs["temperature"] == 400 + assert new_u_nk.attrs["energy_unit"] == "kcal/mol" diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py index 00e3c030..d64c7267 100644 --- a/src/alchemlyb/tests/test_preprocessing.py +++ b/src/alchemlyb/tests/test_preprocessing.py @@ -41,8 +41,8 @@ def u_nk(gmx_benzene_Coulomb_u_nk): @pytest.fixture() -def multi_index_u_nk(gmx_ABFE_complex_n_uk): - return gmx_ABFE_complex_n_uk[0] +def multi_index_u_nk(gmx_ABFE_complex_u_nk): + return gmx_ABFE_complex_u_nk[0] @pytest.fixture() @@ -470,7 +470,7 @@ def test_decorrelate_dhdl_multiple_l(multi_index_dHdl): ) -def test_raise_non_uk(multi_index_dHdl): +def test_raise_nou_nk(multi_index_dHdl): with pytest.raises(ValueError): decorrelate_u_nk( 
multi_index_dHdl, @@ -544,3 +544,16 @@ def test_statistical_inefficiency(self, caplog, u_nk): assert "Running statistical inefficiency analysis." in caplog.text assert "Statistical inefficiency:" in caplog.text assert "Number of uncorrelated samples:" in caplog.text + + +def test_unequil_input(dHdl): + with pytest.raises(ValueError, match="should be same as the length of series"): + statistical_inefficiency(dHdl, series=dHdl[:10]) + + +def test_series_none(dHdl): + with pytest.warns( + UserWarning, + match="The series input is `None`, would not subsample according to statistical inefficiency.", + ): + statistical_inefficiency(dHdl, series=None) diff --git a/src/alchemlyb/tests/test_workflow_ABFE.py b/src/alchemlyb/tests/test_workflow_ABFE.py index 82cc83ab..c2dc4b15 100644 --- a/src/alchemlyb/tests/test_workflow_ABFE.py +++ b/src/alchemlyb/tests/test_workflow_ABFE.py @@ -258,7 +258,7 @@ def test_single_estimator_ti(self, workflow, monkeypatch): summary = workflow.generate_result() assert np.isclose(summary["TI"]["Stages"]["TOTAL"], 21.51472826028906, 0.1) - def test_unprocessed_n_uk(self, workflow, monkeypatch): + def test_unprocessed_u_nk(self, workflow, monkeypatch): monkeypatch.setattr(workflow, "u_nk_sample_list", None) monkeypatch.setattr(workflow, "estimator", dict()) workflow.estimate()