Skip to content

Commit

Permalink
Merge branch 'explicit-patching'.
Browse files Browse the repository at this point in the history
This changes the default behaviour of importing `b2h5py` so that explicit
patching of the `h5py.Dataset` class is required, either by calling the
function `b2h5py.enable_fast_slicing()` or by using the
`b2h5py.fast_slicing()` context manager.  The old behaviour can still be
obtained by explicitly importing `b2h5py.auto`.

Documentation and tests have been updated accordingly.
  • Loading branch information
ivilata committed Dec 13, 2023
2 parents 0cd283e + 2d47f23 commit 229d25e
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 64 deletions.
9 changes: 6 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,18 @@ Benchmarks of this technique show 2x-5x speed-ups compared with normal filter-ba
Usage
-----

This optimized access works for slices with step 1 on Blosc2-compressed datasets using the native byte order. It is enabled by monkey-patching the ``h5py.Dataset`` class to extend the slicing operation. This is done on module import, so the only thing you need to do is::
This optimized access works for slices with step 1 on Blosc2-compressed datasets using the native byte order. It is enabled by monkey-patching the ``h5py.Dataset`` class to extend the slicing operation. The easiest way to do this is::

import b2h5py
import b2h5py.auto

After that, optimization will be attempted for any slicing of a dataset (of the form ``dataset[...]`` or ``dataset.__getitem__(...)``). If the optimization is not possible in a particular case, normal h5py slicing code will be used (which performs HDF5 filter-based access, backed by hdf5plugin_ to support Blosc2).

.. _hdf5plugin: https://github.com/silx-kit/hdf5plugin

You may globally disable the optimization after importing ``b2h5py`` by calling ``b2h5py.disable_fast_slicing()``, and enable it again with ``b2h5py.enable_fast_slicing()``. You may also enable it temporarily by using ``b2h5py.fast_slicing()`` to get a context manager.
You may instead just ``import b2h5py`` and explicitly enable the optimization globally by calling ``b2h5py.enable_fast_slicing()``, and disable it again with ``b2h5py.disable_fast_slicing()``. You may also enable it temporarily by using a context manager::

with b2h5py.fast_slicing():
# ... code that will use Blosc2 optimized slicing ...

Building
--------
Expand Down
21 changes: 9 additions & 12 deletions b2h5py/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
Optimizations are applied to slices of the form ``dataset[...]`` or
``dataset.__getitem__(...)`` with step 1 on Blosc2-compressed datasets using
the native byte order.
the native byte order. They are implemented by monkey-patching the
``h5py.Dataset`` class.
They are enabled automatically on module import, by monkey-patching the
``h5py.Dataset`` class. You may explicitly undo this patching and deactivate
optimization globally with `disable_fast_slicing()` and redo it and activate
it again with `enable_fast_slicing()`. You may also patch the class and
activate optimization temporarily using `fast_slicing()` to get a context
manager.
Optimizations need to be enabled explicitly. One option is to call
`enable_fast_slicing()` to enable them globally (by performing the patching).
Then `disable_fast_slicing()` may be called to disable them again (by undoing
the patching). As an alternative, you may also activate optimizations
temporarily using `fast_slicing()` to get a context manager.
**Note:** For testing and debugging purposes, you may force-disable the
optimization at any time by setting ``BLOSC2_FILTER=1`` in the environment.
**Note:** For testing and debugging purposes, you may force-disable
optimizations at any time by setting ``BLOSC2_FILTER=1`` in the environment.
"""

from .blosc2 import (disable_fast_slicing,
Expand All @@ -25,6 +25,3 @@
'enable_fast_slicing',
'fast_slicing',
'is_fast_slicing_enabled']


enable_fast_slicing()
14 changes: 14 additions & 0 deletions b2h5py/auto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Automatic activation of Blosc2 optimized slicing for h5py.
Importing this module enables the optimization globally; just use::
import b2h5py.auto
After that, all slicing operations on Blosc2-compressed datasets will be
transparently optimized when possible.
"""

from .blosc2 import enable_fast_slicing


enable_fast_slicing()
50 changes: 23 additions & 27 deletions b2h5py/tests/test_dataset_patching.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,29 @@
import contextlib
import functools

import b2h5py # monkey-patches h5py.Dataset
import b2h5py

from h5py import Dataset
from h5py.tests.common import TestCase


class Blosc2DatasetPatchingTestCase(TestCase):
def setUp(self):
super().setUp()
b2h5py.enable_fast_slicing()

def tearDown(self):
b2h5py.enable_fast_slicing()
b2h5py.disable_fast_slicing()
super().tearDown()

def test_default(self):
"""Dataset class is patched by default"""
self.assertTrue(b2h5py.is_fast_slicing_enabled())

def test_unpatch_patch(self):
"""Unpatching and patching dataset class again"""
b2h5py.disable_fast_slicing()
def test_00default(self):
"""Dataset class is not patched by default"""
self.assertFalse(b2h5py.is_fast_slicing_enabled())

def test_patch_unpatch(self):
"""Patching and unpatching dataset class again"""
b2h5py.enable_fast_slicing()
self.assertTrue(b2h5py.is_fast_slicing_enabled())

b2h5py.disable_fast_slicing()
self.assertFalse(b2h5py.is_fast_slicing_enabled())

def test_patch_again(self):
"""Patching the dataset class twice"""
b2h5py.enable_fast_slicing()
Expand All @@ -53,8 +49,6 @@ def test_unpatch_again(self):

def test_patch_patched(self):
"""Patching when already patched by someone else"""
b2h5py.disable_fast_slicing()

@functools.wraps(Dataset.__getitem__)
def foreign_getitem(self, args, new_dtype=None):
return 42
Expand All @@ -75,6 +69,7 @@ def foreign_getitem(self, args, new_dtype=None):

def test_unpatch_foreign(self):
"""Unpatching when patched over by someone else"""
b2h5py.enable_fast_slicing()

@functools.wraps(Dataset.__getitem__)
def foreign_getitem(self, args, new_dtype=None):
Expand All @@ -88,6 +83,12 @@ def foreign_getitem(self, args, new_dtype=None):
finally:
Dataset.__getitem__ = foreign_getitem.__wrapped__

def test_auto(self):
"""Patching on importing auto module"""
self.assertFalse(b2h5py.is_fast_slicing_enabled())
import b2h5py.auto as b2a
self.assertTrue(b2h5py.is_fast_slicing_enabled())


class CMTestError(Exception):
pass
Expand All @@ -98,14 +99,6 @@ class ContextManagerTestCase(TestCase):

shall_raise = False

def setUp(self):
super().setUp()
b2h5py.disable_fast_slicing()

def tearDown(self):
b2h5py.enable_fast_slicing()
super().tearDown()

def patching_cmgr(self):
"""Checks for error if `self.shall_raise`, patches dataset class"""
test_case = self
Expand All @@ -123,7 +116,7 @@ def maybe_raise(self):
if self.shall_raise:
raise CMTestError

def test_default(self):
def test_00default(self):
"""Dataset class is patched then unpatched"""
self.assertFalse(b2h5py.is_fast_slicing_enabled())
with self.patching_cmgr():
Expand All @@ -142,10 +135,13 @@ def test_already_patched(self):
"""Not unpatching if already patched before entry"""
b2h5py.enable_fast_slicing()
self.assertTrue(b2h5py.is_fast_slicing_enabled())
with self.patching_cmgr():
try:
with self.patching_cmgr():
self.assertTrue(b2h5py.is_fast_slicing_enabled())
self.maybe_raise()
self.assertTrue(b2h5py.is_fast_slicing_enabled())
self.maybe_raise()
self.assertTrue(b2h5py.is_fast_slicing_enabled())
finally:
b2h5py.disable_fast_slicing()

def test_nested(self):
"""Nesting patching context managers"""
Expand Down
3 changes: 2 additions & 1 deletion b2h5py/tests/test_patched_h5py.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def run_h5py_tests():
os.path.dirname(h5py.tests.__file__),
top_level_dir=os.path.dirname(os.path.dirname(h5py.__file__)))
test_runner = unittest.TextTestRunner()
test_runner.run(test_suite)
with b2h5py.fast_slicing():
test_runner.run(test_suite)


if __name__ == '__main__':
Expand Down
25 changes: 18 additions & 7 deletions b2h5py/tests/test_slicing_blosc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import functools
import random

import b2h5py # monkey-patches h5py.Dataset
import b2h5py
import hdf5plugin as h5p
import numpy as np

Expand Down Expand Up @@ -42,6 +42,9 @@ def check_opt_slicing(test):
def checked_test(self):
if not self.should_enable_opt():
return test(self)
# If the dataset class is not patched,
# the exception set below is never raised anyway.
self.assertTrue(b2h5py.is_fast_slicing_enabled())
# Force an exception if the optimization is not used.
orig_exc = b2h5py.blosc2._no_opt_error
b2h5py.blosc2._no_opt_error = Blosc2OptNotUsedError
Expand All @@ -53,7 +56,7 @@ def checked_test(self):


class Blosc2OptSlicingTestCase(TestCase, StoreArrayMixin):
"""Blosc2 optimized slicing"""
"""Blosc2 optimized slicing by patching dataset class"""

blosc2_force_filter = False

Expand All @@ -67,8 +70,10 @@ def setUp(self):

self.blosc2_filter_env = os.environ.get('BLOSC2_FILTER', '0')
os.environ['BLOSC2_FILTER'] = '1' if self.blosc2_force_filter else '0'
b2h5py.enable_fast_slicing()

def tearDown(self):
b2h5py.disable_fast_slicing()
os.environ['BLOSC2_FILTER'] = self.blosc2_filter_env
super().tearDown()

Expand Down Expand Up @@ -167,7 +172,7 @@ def test_astype(self):


class Blosc2FiltSlicingTestCase(Blosc2OptSlicingTestCase):
"""Blosc2 filter slicing"""
"""Blosc2 filter slicing forced by environment variable"""

blosc2_force_filter = True

Expand All @@ -179,10 +184,6 @@ def setUp(self):
super().setUp()
b2h5py.disable_fast_slicing()

def tearDown(self):
b2h5py.enable_fast_slicing()
super().tearDown()

def should_enable_opt(self):
return False

Expand Down Expand Up @@ -214,6 +215,11 @@ def setUp(self):
self.chunks = (2, 2, 1)
self.arr = np.arange(np.prod(shape), dtype="u1").reshape(shape)
StoreArrayMixin.setUp(self)
b2h5py.enable_fast_slicing()

def tearDown(self):
b2h5py.disable_fast_slicing()
TestCase().tearDown()

def should_enable_opt(self):
return True
Expand Down Expand Up @@ -246,6 +252,11 @@ def setUp(self):
arr[4, 4] = (9, 9)
self.arr = arr
StoreArrayMixin.setUp(self)
b2h5py.enable_fast_slicing()

def tearDown(self):
b2h5py.disable_fast_slicing()
TestCase().tearDown()

def should_enable_opt(self):
return True
Expand Down
27 changes: 13 additions & 14 deletions examples/blosc2_optimized_slicing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
It creates a 2-dimensional dataset made of different chunks, compressed with
Blosc2. Then it proceeds to slice the dataset in ways that may and may not
benefit from Blosc2 optimized slicing. Some hints about forcing the use of
the HDF5 filter pipeline are included, as well as comments on the Python
package dependencies required for the different use cases.
benefit from Blosc2 optimized slicing. Examples of different ways to enable
Blosc2 optimized slicing are shown.
Optimized slicing can provide considerable speed-ups in certain use cases,
please see `this benchmark`__ which evaluates applying the same technique in
Expand Down Expand Up @@ -66,11 +65,11 @@ def printl(*args, **kwargs):

# Benefitting from Blosc2 optimized slicing
# -----------------------------------------
# After importing `b2h5py`,
# After importing `b2h5py.auto`,
# support for Blosc2 optimized slicing is enabled by default.
print("# Using Blosc2 optimized slicing")
with h5py.File(file_name, 'r') as f:
import b2h5py
import b2h5py.auto
assert(b2h5py.is_fast_slicing_enabled())
# One just uses slicing as usual.
dataset = f[dataset_name]
Expand All @@ -83,21 +82,22 @@ def printl(*args, **kwargs):
printl("Sparse slice from dataset (filter):", dataset[150::2, 150::2])
printl("Sparse slice from input array:", data[150::2, 150::2])
print()
b2h5py.disable_fast_slicing() # back to normal

# Disabling Blosc2 optimized slicing
# ----------------------------------
# Enabling Blosc2 optimized slicing
# ---------------------------------
# Utility functions are provided to enable and disable optimization globally.
print("# Disabling Blosc2 optimized slicing globally")
print("# Enabling Blosc2 optimized slicing globally")
with h5py.File(file_name, 'r') as f:
import b2h5py
assert(b2h5py.is_fast_slicing_enabled())
b2h5py.disable_fast_slicing()
assert(not b2h5py.is_fast_slicing_enabled())
b2h5py.enable_fast_slicing()
assert(b2h5py.is_fast_slicing_enabled())
dataset = f[dataset_name]
printl("Slice from dataset (filter):", dataset[150:, 150:])
printl("Slice from dataset (optimized):", dataset[150:, 150:])
printl("Slice from input array:", data[150:, 150:])
b2h5py.enable_fast_slicing() # back to normal
assert(b2h5py.is_fast_slicing_enabled())
b2h5py.disable_fast_slicing() # back to normal
assert(not b2h5py.is_fast_slicing_enabled())
print()

# Enabling Blosc2 optimized slicing temporarily
Expand All @@ -107,7 +107,6 @@ def printl(*args, **kwargs):
print("# Enabling Blosc2 optimized slicing temporarily")
with h5py.File(file_name, 'r') as f:
import b2h5py
b2h5py.disable_fast_slicing()
assert(not b2h5py.is_fast_slicing_enabled())
dataset = f[dataset_name]
printl("Slice from dataset (filter):", dataset[150:, 150:])
Expand Down

0 comments on commit 229d25e

Please sign in to comment.