From 12089baae54f3617e003b1ee9ee417c41e56f2d4 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sun, 22 Sep 2024 13:14:39 -0700 Subject: [PATCH 01/10] paper first draft --- joss_paper/paper.bib | 103 +++++++++++++++++++++++++++++++++++++++++++ joss_paper/paper.md | 85 +++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 joss_paper/paper.bib create mode 100644 joss_paper/paper.md diff --git a/joss_paper/paper.bib b/joss_paper/paper.bib new file mode 100644 index 0000000..5551316 --- /dev/null +++ b/joss_paper/paper.bib @@ -0,0 +1,103 @@ + +@article{H5MD:2014, + title = {H5MD: A structured, efficient, and portable file format for molecular data}, + journal = {Computer Physics Communications}, + volume = {185}, + number = {6}, + pages = {1546--1553}, + year = {2014}, + issn = {0010-4655}, + doi = {10.1016/j.cpc.2014.01.018}, + url = {https://www.sciencedirect.com/science/article/pii/S0010465514000447}, + author = {Pierre {de Buyl} and Peter H. Colberg and Felix Höfling}, + keywords = {Molecular simulation, HDF5}, + abstract = {We propose a new file format named “H5MD” for storing molecular simulation data, such as trajectories of particle positions and velocities, along with thermodynamic observables that are monitored during the course of the simulation. H5MD files are HDF5 (Hierarchical Data Format) files with a specific hierarchy and naming scheme. Thus, H5MD inherits many benefits of HDF5, e.g., structured layout of multi-dimensional datasets, data compression, fast and parallel I/O, and portability across many programming languages and hardware platforms. H5MD files are self-contained, and foster the reproducibility of scientific data and the interchange of data between researchers using different simulation programs and analysis software. In addition, the H5MD specification can serve for other kinds of data (e.g. 
experimental data) and is extensible to supplemental data, or may be part of an enclosing file structure.} +} + +@inproceedings{H5MDReader:2021, + address = {Austin, TX}, + title = {{MPI}-parallel {Molecular} {Dynamics} {Trajectory} {Analysis} with the {H5MD} {Format} in the {MDAnalysis} {Python} {Package}}, + url = {https://conference.scipy.org/proceedings/scipy2021/edis_jakupovic.html}, + doi = {10.25080/majora-1b6fd038-005}, + abstract = {Molecular dynamics (MD) computer simulations help elucidate details of the molecular processes in complex biological systems, from protein dynamics to drug discovery. One major issue is that these MD simulation files are now commonly terabytes in size, which means analyzing the data from these files becomes a painstakingly expensive task. In the age of national supercomputers, methods of parallel analysis are becoming a necessity for the efficient use of time and high performance computing (HPC) resources but for any approach to parallel analysis, simply reading the file from disk becomes the performance bottleneck that limits overall analysis speed. One promising way around this file I/O hurdle is to use a parallel message passing interface (MPI) implementation with the HDF5 (Hierarchical Data Format 5) file format to access a single file simultaneously with numerous processes on a parallel file system. Our previous feasibility study suggested that this combination can lead to favorable parallel scaling with hundreds of CPU cores, so we implemented a fast and user-friendly HDF5 reader (the H5MDReader class) that adheres to H5MD (HDF5 for Molecular Dynamics) specifications. We made H5MDReader (together with a H5MD output class H5MDWriter) available in the MDAnalysis library, a Python package that simplifies the process of reading and writing various popular MD file formats by providing a streamlined user-interface that is independent of any specific file format. 
We benchmarked H5MDReader's parallel file reading capabilities on three HPC clusters: ASU Agave, SDSC Comet, and PSC Bridges. The benchmark consisted of a simple split-apply-combine scheme of an I/O bound task that split a 90k frame (113 GiB) coordinate trajectory into chunks for processes, where each process performed the commonly used RMSD (root mean square distance after optimal structural superposition) calculation on their chunk of data, and then gathered the results back to the root process. For baseline performance, we found maximum I/O speedups at 2 full nodes, with Agave showing 20x, and a maximum computation speedup on Comet of 373x on 384 cores (all three HPCs scaled well in their computation task). We went on to test a series of optimizations attempting to speed up I/O performance, including adjusting file system stripe count, implementing a masked array feature that only loads relevant data for the computation task, front loading all I/O by loading the entire trajectory into memory, and manually adjusting the HDF5 dataset chunk shapes. We found the largest improvement in I/O performance by optimizing the chunk shape of the HDF5 datasets to match the iterative access pattern of our analysis benchmark. With respect to baseline serial performance, our best result was a 98x speedup at 112 cores on ASU Agave. In terms of absolute time saved, the analysis went from 4623 seconds in the baseline serial run to 47 seconds in the parallel, properly chunked run. 
Our results emphasize the fact that file I/O is not just dependent on the access pattern of the file, but more so the synergy between access pattern and the layout of the file on disk.}, + urldate = {2021-07-05}, + booktitle = {Proceedings of the 20th {Python} in {Science} {Conference}}, + author = {Jakupovic, Edis and Beckstein, Oliver}, + editor = {Agarwal, Meghann and Calloway, Chris and Niederhut, Dillon and Shupe, David}, + year = {2021}, + pages = {40--48}, +} + +@INPROCEEDINGS{MDAKits:2023, + title = "{MDAKits}: A framework for {FAIR-compliant} molecular + simulation analysis", + booktitle = "Proceedings of the Python in Science Conference", + author = "Alibay, Irfan and Wang, Lily and Naughton, Fiona and Kenney, + Ian and Barnoud, Jonathan and Gowers, Richard and Beckstein, + Oliver", + publisher = "SciPy", + pages = "76--84", + year = 2023, + conference = "Python in Science Conference", + location = "Austin, Texas" +} + + +@InProceedings{MDAnalysis:2016, + author = { {R}ichard {J}. {G}owers and {M}ax {L}inke and {J}onathan {B}arnoud and {T}yler {J}. {E}. {R}eddy and {M}anuel {N}. {M}elo and {S}ean {L}. {S}eyler and {J}an {D}omański and {D}avid {L}. {D}otson and {S}ébastien {B}uchoux and {I}an {M}. {K}enney and {O}liver {B}eckstein }, + title = { {M}{D}{A}nalysis: {A} {P}ython {P}ackage for the {R}apid {A}nalysis of {M}olecular {D}ynamics {S}imulations }, + booktitle = { {P}roceedings of the 15th {P}ython in {S}cience {C}onference }, + pages = { 98 - 105 }, + year = { 2016 }, + editor = { {S}ebastian {B}enthall and {S}cott {R}ostrup }, + doi = { 10.25080/Majora-629e541a-00e } +} + + +@article{MDAnalysis:2011, + author = {Michaud-Agrawal, Naveen and Denning, Elizabeth J. and Woolf, Thomas B. 
and Beckstein, Oliver}, + title = {MDAnalysis: A toolkit for the analysis of molecular dynamics simulations}, + journal = {Journal of Computational Chemistry}, + volume = {32}, + number = {10}, + pages = {2319--2327}, + keywords = {molecular dynamics simulations, analysis, proteins, object-oriented design, software, membrane systems, Python programming language}, + doi = {10.1002/jcc.21787}, + url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/jcc.21787}, + eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/jcc.21787}, + abstract = {Abstract MDAnalysis is an object-oriented library for structural and temporal analysis of molecular dynamics (MD) simulation trajectories and individual protein structures. It is written in the Python language with some performance-critical code in C. It uses the powerful NumPy package to expose trajectory data as fast and efficient NumPy arrays. It has been tested on systems of millions of particles. Many common file formats of simulation packages including CHARMM, Gromacs, Amber, and NAMD and the Protein Data Bank format can be read and written. Atoms can be selected with a syntax similar to CHARMM's powerful selection commands. MDAnalysis enables both novice and experienced programmers to rapidly write their own analytical tools and access data stored in trajectories in an easily accessible manner that facilitates interactive explorative analysis. MDAnalysis has been tested on and works for most Unix-based platforms such as Linux and Mac OS X. It is freely available under the GNU General Public License from http://mdanalysis.googlecode.com. © 2011 Wiley Periodicals, Inc. J Comput Chem 2011}, + year = {2011} +} + +@Article{NumPy:2020, + title = {Array programming with {NumPy}}, + author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. + van der Walt and Ralf Gommers and Pauli Virtanen and David + Cournapeau and Eric Wieser and Julian Taylor and Sebastian + Berg and Nathaniel J. 
Smith and Robert Kern and Matti Picus + and Stephan Hoyer and Marten H. van Kerkwijk and Matthew + Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del + R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre + G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and + Warren Weckesser and Hameer Abbasi and Christoph Gohlke and + Travis E. Oliphant}, + year = {2020}, + month = sep, + journal = {Nature}, + volume = {585}, + number = {7825}, + pages = {357--362}, + doi = {10.1038/s41586-020-2649-2}, + publisher = {Springer Science and Business Media {LLC}}, + url = {https://doi.org/10.1038/s41586-020-2649-2} +} + +@misc{Zarr:2024, + doi = {10.5281/ZENODO.3773449}, + url = {https://zenodo.org/doi/10.5281/zenodo.3773449}, + author = {Alistair Miles and jakirkham and M Bussonnier and Josh Moore and Dimitri Papadopoulos Orfanos and Davis Bennett and David Stansby and Joe Hamman and James Bourbeau and Andrew Fulton and Gregory Lee and Ryan Abernathey and Norman Rzepka and Zain Patel and Mads R. B. 
Kristensen and Sanket Verma and Saransh Chopra and Matthew Rocklin and AWA BRANDON AWA and Max Jones and Martin Durant and Elliott Sales de Andrade and Vincent Schut and dussin, raphael and Shivank Chaudhary and Chris Barnes and Juan Nunez-Iglesias and shikharsg}, + title = {zarr-developers/zarr-python: v3.0.0-alpha}, + publisher = {Zenodo}, + year = {2024}, + copyright = {Creative Commons Attribution 4.0 International} +} \ No newline at end of file diff --git a/joss_paper/paper.md b/joss_paper/paper.md new file mode 100644 index 0000000..3866fbe --- /dev/null +++ b/joss_paper/paper.md @@ -0,0 +1,85 @@ +--- +title: 'Zarrtraj: A Python package for streaming molecular dynamics trajectories from cloud services' +tags: + - streaming + - molecular-dynamics + - file-format + - mdanalysis + - zarr +authors: + - name: Lawson Woods + orcid: 0009-0003-0713-4167 + affiliation: 1 + - name: Hugo MacDermott-Opeskin + orcid: 0000-0002-7393-7457 + affiliation: 1 + - name: Oliver Beckstein + orcid: 0000-0003-1340-0831 + affiliation: 1 + - name: Edis Jakupovic + affiliation: 1 + - name: Yuxuan Zhuang + orcid: 0000-0003-4390-8556 + affiliation: 1 + - name: Richard J Gowers + orcid: 0000-0002-3241-1846 + affiliation: 1 +affiliations: + - name: Placeholder + index: 1 +date: 22 September 2024 +bibliography: paper.bib +--- + +# Summary + +Molecular dynamics simulations provide a microscope into the behavior of +atomic-scale environments otherwise prohibitively diffult to observe, however, +the resulting trajectory data is too often siloed in a single institutions' +HPC environment, rendering it unusable by the broader scientific community. +Zarrtraj enables these trajectories to be read directly from cloud storage providers +like AWS, Google Cloud, and Microsoft Azure into MDAnalysis, a popular Python +package for analyzing trajectory data, providing a method to open up access to +trajectory data to anyone with an internet connection. 
+ +# Statement of need + +The computing power in HPC environments has increased to the point where +running simulation algorithms is often no longer the constraint in obtaining +molecular dynamics trajectory data for analysis. Instead, the speed of writing to disk +the ability to share generated data provide new constraints on research in this field. +While exposing download links on the open internet offers one solution this problem, +molecular dynamics trajectories are often massive files which are slow to download and expensive +to store at scale, so a solution which could prevent this duplication of storage and uneccessary +download step would be more ideal. + +Enter `Zarrtraj`, an `MDAnalysis` [@MDAnalysis:2016] `MDAKit` [@MDAKits:2023] which enables +streaming these trajectories from AWS S3, Google Cloud Buckets, and Azure Blob Storage and Data +Lakes without ever downloading them using the standard `MDAnalysis` trajectory reader API. +This is possible thanks to the `Zarr` [Zarr:2024] package which allows streaming array-like +data from a variety of storage mediums and `Kerchunk`, which extends the capability of `Zarr` +by allowing it to read `HDF5` files in addition to `Zarr` files. Trajectory data can be streamed +in the `H5MD` format [@H5MD:2014], which builds on top of `HDF5`, and the experimental `ZarrMD` format, +which ports `H5MD` to the `Zarr` filetype. This work builds on the existing `MDAnalysis` `H5MDReader` +[@H5MDReader:2021], and similarly uses `NumPy` [@NumPy:2020] as a common interface in-between `MDAnalysis` +and the file storage medium. + + + +# Acknowledgements +Thank you to Google for supporting the Google Summer of Code program (GSoC) which provided +financial support for this project. Thank you to Dr. Hugo MacDermott-Opeskin and Dr. Yuxuan Zhuang +for their mentorship and feedback and to Dr. Jenna Swarthout Goddard for supporting the GSoC program at MDAnalysis. +Thank you to Dr. 
Oliver Beckstein and Edis Jakupovic for lending their expertise in H5MD and all things MDAnalysis. +Finally, thanks to Martin Durant, author of Kerchunk, for helping refine and merge features in his upstream codebase +necessary for this project. + +# References \ No newline at end of file From 502821c9bcada070b3bebad24b9347ed2ca4df8d Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sun, 22 Sep 2024 13:16:33 -0700 Subject: [PATCH 02/10] typo --- joss_paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/joss_paper/paper.md b/joss_paper/paper.md index 3866fbe..467195f 100644 --- a/joss_paper/paper.md +++ b/joss_paper/paper.md @@ -46,7 +46,7 @@ trajectory data to anyone with an internet connection. The computing power in HPC environments has increased to the point where running simulation algorithms is often no longer the constraint in obtaining -molecular dynamics trajectory data for analysis. Instead, the speed of writing to disk +molecular dynamics trajectory data for analysis. Instead, the speed of writing to disk and the ability to share generated data provide new constraints on research in this field. While exposing download links on the open internet offers one solution this problem, molecular dynamics trajectories are often massive files which are slow to download and expensive From b7a74f382abd9aa35867f0c50a8dcd2a09a7bb29 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sun, 22 Sep 2024 13:17:07 -0700 Subject: [PATCH 03/10] typo --- joss_paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/joss_paper/paper.md b/joss_paper/paper.md index 467195f..83d5c89 100644 --- a/joss_paper/paper.md +++ b/joss_paper/paper.md @@ -56,7 +56,7 @@ download step would be more ideal. 
Enter `Zarrtraj`, an `MDAnalysis` [@MDAnalysis:2016] `MDAKit` [@MDAKits:2023] which enables streaming these trajectories from AWS S3, Google Cloud Buckets, and Azure Blob Storage and Data Lakes without ever downloading them using the standard `MDAnalysis` trajectory reader API. -This is possible thanks to the `Zarr` [Zarr:2024] package which allows streaming array-like +This is possible thanks to the `Zarr` [@Zarr:2024] package which allows streaming array-like data from a variety of storage mediums and `Kerchunk`, which extends the capability of `Zarr` by allowing it to read `HDF5` files in addition to `Zarr` files. Trajectory data can be streamed in the `H5MD` format [@H5MD:2014], which builds on top of `HDF5`, and the experimental `ZarrMD` format, From 7ff2ef314b1bdc5eca5b492c2a567126ede582d7 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sun, 22 Sep 2024 13:25:41 -0700 Subject: [PATCH 04/10] spelling --- joss_paper/paper.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/joss_paper/paper.md b/joss_paper/paper.md index 83d5c89..81347aa 100644 --- a/joss_paper/paper.md +++ b/joss_paper/paper.md @@ -34,7 +34,7 @@ bibliography: paper.bib # Summary Molecular dynamics simulations provide a microscope into the behavior of -atomic-scale environments otherwise prohibitively diffult to observe, however, +atomic-scale environments otherwise prohibitively difficult to observe, however, the resulting trajectory data is too often siloed in a single institutions' HPC environment, rendering it unusable by the broader scientific community. Zarrtraj enables these trajectories to be read directly from cloud storage providers @@ -50,7 +50,7 @@ molecular dynamics trajectory data for analysis. Instead, the speed of writing t the ability to share generated data provide new constraints on research in this field. 
While exposing download links on the open internet offers one solution this problem, molecular dynamics trajectories are often massive files which are slow to download and expensive -to store at scale, so a solution which could prevent this duplication of storage and uneccessary +to store at scale, so a solution which could prevent this duplication of storage and unnecessary download step would be more ideal. Enter `Zarrtraj`, an `MDAnalysis` [@MDAnalysis:2016] `MDAKit` [@MDAKits:2023] which enables From 7c9cd2c074f824cec33910db4abe2a466111f83b Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Thu, 26 Sep 2024 12:13:05 -0700 Subject: [PATCH 05/10] revisions --- joss_paper/paper.bib | 77 ++++++++++++++++++++++++++++++++- joss_paper/paper.md | 100 +++++++++++++++++++++++++++++-------------- 2 files changed, 144 insertions(+), 33 deletions(-) diff --git a/joss_paper/paper.bib b/joss_paper/paper.bib index 5551316..56e687e 100644 --- a/joss_paper/paper.bib +++ b/joss_paper/paper.bib @@ -1,3 +1,23 @@ +@article{FAIR:2019, + title = {Make scientific data FAIR}, + volume = {570}, + ISSN = {1476-4687}, + url = {http://dx.doi.org/10.1038/d41586-019-01720-7}, + DOI = {10.1038/d41586-019-01720-7}, + number = {7759}, + journal = {Nature}, + publisher = {Springer Science and Business Media LLC}, + author = {Stall, Shelley and Yarmey, Lynn and Cutcher-Gershenfeld, Joel and Hanson, Brooks and Lehnert, Kerstin and Nosek, Brian and Parsons, Mark and Robinson, Erin and Wyborn, Lesley}, + year = {2019}, + month = jun, + pages = {27–29} +} + +@misc{FoldingAtHome:2020, + title = {Foldingathome COVID-19 Datasets}, + url = {https://registry.opendata.aws/foldingathome-covid19}, + note = {Accessed: September 25, 2024} +} @article{H5MD:2014, title = {H5MD: A structured, efficient, and portable file format for molecular data}, @@ -69,6 +89,26 @@ @article{MDAnalysis:2011 year = {2011} } +@article {MDverse:2024, + article_type = {journal}, + title = {MDverse, shedding light on the dark matter of 
molecular dynamics simulations}, + author = {Tiemann, Johanna KS and Szczuka, Magdalena and Bouarroudj, Lisa and Oussaren, Mohamed and Garcia, Steven and Howard, Rebecca J and Delemotte, Lucie and Lindahl, Erik and Baaden, Marc and Lindorff-Larsen, Kresten and Chavent, Matthieu and Poulain, Pierre}, + editor = {Haider, Shozeb and Cui, Qiang}, + volume = 12, + year = 2024, + month = {aug}, + pub_date = {2024-08-30}, + pages = {RP90061}, + citation = {eLife 2024;12:RP90061}, + doi = {10.7554/eLife.90061}, + url = {https://doi.org/10.7554/eLife.90061}, + abstract = {The rise of open science and the absence of a global dedicated data repository for molecular dynamics (MD) simulations has led to the accumulation of MD files in generalist data repositories, constituting the \textit{dark matter of MD} — data that is technically accessible, but neither indexed, curated, or easily searchable. Leveraging an original search strategy, we found and indexed about 250,000 files and 2000 datasets from Zenodo, Figshare and Open Science Framework. With a focus on files produced by the Gromacs MD software, we illustrate the potential offered by the mining of publicly available MD data. We identified systems with specific molecular composition and were able to characterize essential parameters of MD simulation such as temperature and simulation length, and could identify model resolution, such as all-atom and coarse-grain. Based on this analysis, we inferred metadata to propose a search engine prototype to explore the MD data. To continue in this direction, we call on the community to pursue the effort of sharing MD data, and to report and standardize metadata to reuse this valuable matter.}, + keywords = {molecular dynamics, simulation, modeling, FAIR}, + journal = {eLife}, + issn = {2050-084X}, + publisher = {eLife Sciences Publications, Ltd}, +} + @Article{NumPy:2020, title = {Array programming with {NumPy}}, author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. 
@@ -92,6 +132,40 @@ @Article{NumPy:2020 url = {https://doi.org/10.1038/s41586-020-2649-2} } +@ARTICLE{PANGEO:2022, + AUTHOR={Stern, Charles and Abernathey, Ryan and Hamman, Joseph and Wegener, Rachel and Lepore, Chiara and Harkins, Sean and Merose, Alexander }, + + TITLE={Pangeo Forge: Crowdsourcing Analysis-Ready, Cloud Optimized Data Production}, + + JOURNAL={Frontiers in Climate}, + + VOLUME={3}, + + YEAR={2022}, + + URL={https://www.frontiersin.org/journals/climate/articles/10.3389/fclim.2021.782909}, + + DOI={10.3389/fclim.2021.782909}, + + ISSN={2624-9553}, + + ABSTRACT={

Pangeo Forge is a new community-driven platform that accelerates science by providing high-level recipe frameworks alongside cloud compute infrastructure for extracting data from provider archives, transforming it into analysis-ready, cloud-optimized (ARCO) data stores, and providing a human- and machine-readable catalog for browsing and loading. In abstracting the scientific domain logic of data recipes from cloud infrastructure concerns, Pangeo Forge aims to open a door for a broader community of scientists to participate in ARCO data production. A wholly open-source platform composed of multiple modular components, Pangeo Forge presents a foundation for the practice of reproducible, cloud-native, big-data ocean, weather, and climate science without relying on proprietary or cloud-vendor-specific tooling.

} +} + +@inproceedings{ParallelAnalysis:2010, + author = {Tu, Tiankai and Rendleman, Charles A. and Miller, Patrick J. and Sacerdoti, Federico and Dror, Ron O. and Shaw, David E.}, + title = {Accelerating parallel analysis of scientific simulation data via Zazen}, + year = {2010}, + publisher = {USENIX Association}, + address = {USA}, + abstract = {As a new generation of parallel supercomputers enables researchers to conduct scientific simulations of unprecedented scale and resolution, terabyte-scale simulation output has become increasingly commonplace. Analysis of such massive data sets is typically I/O-bound: many parallel analysis programs spend most of their execution time reading data from disk rather than performing useful computation. To overcome this I/O bottleneck, we have developed a new data access method. Our main idea is to cache a copy of simulation output files on the local disks of an analysis cluster's compute nodes, and to use a novel task-assignment protocol to co-locate data access with computation. We have implemented our methodology in a parallel disk cache system called Zazen. By avoiding the overhead associated with querying metadata servers and by reading data in parallel from local disks, Zazen is able to deliver a sustained read bandwidth of over 20 gigabytes per second on a commodity Linux cluster with 100 nodes, approaching the optimal aggregated I/O bandwidth attainable on these nodes. Compared with conventional NFS, PVFS2, and Hadoop/HDFS, respectively, Zazen is 75, 18, and 6 times faster for accessing large (1-GB) files, and 25, 13, and 85 times faster for accessing small (2-MB) files. 
We have deployed Zazen in conjunction with Anton--a special-purpose supercomputer that dramatically accelerates molecular dynamics (MD) simulations-- and have been able to accelerate the parallel analysis of terabyte-scale MD trajectories by about an order of magnitude.}, + booktitle = {Proceedings of the 8th USENIX Conference on File and Storage Technologies}, + pages = {10}, + numpages = {1}, + location = {San Jose, California}, + series = {FAST'10} +} + @misc{Zarr:2024, doi = {10.5281/ZENODO.3773449}, url = {https://zenodo.org/doi/10.5281/zenodo.3773449}, @@ -100,4 +174,5 @@ @misc{Zarr:2024 publisher = {Zenodo}, year = {2024}, copyright = {Creative Commons Attribution 4.0 International} -} \ No newline at end of file +} + diff --git a/joss_paper/paper.md b/joss_paper/paper.md index 81347aa..ecc248a 100644 --- a/joss_paper/paper.md +++ b/joss_paper/paper.md @@ -33,53 +33,89 @@ bibliography: paper.bib # Summary -Molecular dynamics simulations provide a microscope into the behavior of +Molecular dynamics (MD) simulations provide a microscope into the behavior of atomic-scale environments otherwise prohibitively difficult to observe, however, the resulting trajectory data is too often siloed in a single institutions' HPC environment, rendering it unusable by the broader scientific community. Zarrtraj enables these trajectories to be read directly from cloud storage providers like AWS, Google Cloud, and Microsoft Azure into MDAnalysis, a popular Python package for analyzing trajectory data, providing a method to open up access to -trajectory data to anyone with an internet connection. +trajectory data to anyone with an internet connection. Enabling cloud streaming +for MD trajectories empowers easier replication of published analysis results, +analyses of large, conglomerate datasets from different sources, and training +machine learning models without downloading and storing trajectory data. 
# Statement of need The computing power in HPC environments has increased to the point where -running simulation algorithms is often no longer the constraint in obtaining -molecular dynamics trajectory data for analysis. Instead, the speed of writing to disk and -the ability to share generated data provide new constraints on research in this field. -While exposing download links on the open internet offers one solution this problem, -molecular dynamics trajectories are often massive files which are slow to download and expensive -to store at scale, so a solution which could prevent this duplication of storage and unnecessary -download step would be more ideal. +running simulation algorithms is often no longer the constraint in +obtaining scientific insights from molecular dynamics trajectory data. +Instead, the ability to process, analyze and share large volumes of data provide +new constraints on research in this field. -Enter `Zarrtraj`, an `MDAnalysis` [@MDAnalysis:2016] `MDAKit` [@MDAKits:2023] which enables -streaming these trajectories from AWS S3, Google Cloud Buckets, and Azure Blob Storage and Data -Lakes without ever downloading them using the standard `MDAnalysis` trajectory reader API. -This is possible thanks to the `Zarr` [@Zarr:2024] package which allows streaming array-like -data from a variety of storage mediums and `Kerchunk`, which extends the capability of `Zarr` -by allowing it to read `HDF5` files in addition to `Zarr` files. Trajectory data can be streamed -in the `H5MD` format [@H5MD:2014], which builds on top of `HDF5`, and the experimental `ZarrMD` format, -which ports `H5MD` to the `Zarr` filetype. This work builds on the existing `MDAnalysis` `H5MDReader` -[@H5MDReader:2021], and similarly uses `NumPy` [@NumPy:2020] as a common interface in-between `MDAnalysis` -and the file storage medium. 
+Other groups in the field recognize this same need for adherence to +FAIR principles [@FAIR:2019] including the MDDB (Molecular Dynamics Data Bank), an EU-scale +repository for biosimulation data [@MDDB:2024] and MDverse, a prototype search engine +for publicly-available Gromacs simulation data [@MDverse:2024]. +While these efforts currently offer prototype solutions for indexing and +searching MD trajectory data, the problem of efficiently distributing the data remains. + +Though exposing download links on the open internet offers a simple solution to this problem, +on-disk representations of molecular dynamics trajectories are often very large, +with datasets reaching terabytes in scale [@ParallelAnalysis:2010; @FoldingAtHome:2020], +so a solution which could prevent this +duplication of storage and unnecessary download step would provide greater utility +for the computational molecular sciences ecosystem. + +Enter `Zarrtraj`, the first fully-functioning tool to our knowledge that allows +streaming trajectories into analysis software using an established trajectory format. +`Zarrtraj` is implemented as an `MDAnalysis` [@MDAnalysis:2016] `MDAKit` [@MDAKits:2023] that +enables streaming MD trajectories in the popular `HDF5`-based H5MD format [@H5MD:2014] +from AWS S3, Google Cloud Buckets, and Azure Blob Storage & Data Lakes without ever downloading them. +This is possible thanks to the `Zarr` [@Zarr:2024] package which allows +streaming array-like data from a variety of storage mediums and [Kerchunk](https://github.com/fsspec/kerchunk), +which extends the capability of `Zarr` by allowing it to read `HDF5` files. +Because it implements the standard `MDAnalysis` trajectory reader API, +`Zarrtraj` can leverage `Zarr`'s ability to read a file in parallel to perform analysis +algorithms in parallel using the "split-apply-combine" paradigm. 
In addition to the `H5MD` format, +`Zarrtraj` can stream and write trajectories in the experimental `ZarrMD` +format, which ports the `H5MD` layout to the `Zarr` filetype. + +Once imported, `Zarrtraj` allows passing trajectory URLs just like ordinary files: +```python +import zarrtraj +import MDAnalysis as mda - # Acknowledgements -Thank you to Google for supporting the Google Summer of Code program (GSoC) which provided -financial support for this project. Thank you to Dr. Hugo MacDermott-Opeskin and Dr. Yuxuan Zhuang -for their mentorship and feedback and to Dr. Jenna Swarthout Goddard for supporting the GSoC program at MDAnalysis. -Thank you to Dr. Oliver Beckstein and Edis Jakupovic for lending their expertise in H5MD and all things MDAnalysis. -Finally, thanks to Martin Durant, author of Kerchunk, for helping refine and merge features in his upstream codebase -necessary for this project. +Thank you to Dr. Jenna Swarthout Goddard for supporting the GSoC program at MDAnalysis. +Thank you to Martin Durant, author of Kerchunk, for helping refine and merge features in his upstream codebase +necessary for this project. Finally, thank you to Google for supporting the Google Summer of +Code program (GSoC) which provided financial support for this project. 
# References \ No newline at end of file From cce22adec8374b2320614003aa8f4f9fc4ca928f Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Thu, 26 Sep 2024 12:13:23 -0700 Subject: [PATCH 06/10] benchmak figures --- joss_paper/benchmark.png | Bin 0 -> 30141 bytes joss_paper/figure_1.ipynb | 78 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 joss_paper/benchmark.png create mode 100644 joss_paper/figure_1.ipynb diff --git a/joss_paper/benchmark.png b/joss_paper/benchmark.png new file mode 100644 index 0000000000000000000000000000000000000000..edafbd07c7c2eb7858fdc06b8153b9be93cd49f7 GIT binary patch literal 30141 zcmd?Sc|6vA*EXu@YNmM()k#qiB7`(M4JlJ9LoyFhGLOv~6iH>)AVN|hGf62KGiI(t zWF9h|hWA+R`+2VCkNxcZKKJ{+pU<=R{-e8G&hs~Xzu&div5s}DbzeVx=-~W0tLCt= zvCU`7?Nec6o5sz?HdS-hU-&Npb{Eg!FA1Cdnl`FdMmF|()`o0KdNyY)tZXbyPjTBB zTAwquvJ?>z7uddq+r-A^%sELx!PEc#0Rby(V?i#S4Ys(*>@#x5&ats^=+U1k&vr$a zva#9nGxzOQa|r6Fvv*ihJ~7batjgRleU|hBCyw+>{p=U7M5*mJm0!Gc$&`DyH}TKu zz013QI-mSO$&0$O3-}(FID0UrFH$M&o2jgE|1zf$v)4^~dF6+~{6ZJIK?A$aU=d@B zoD~~zZ?>1`Z^q4~e|S4Bkfpyhud+?WUvGt{FQ6aquRBkFy_`2?I{rFuIE$Np+_IN# z3jR8|a_TJlG4Fr!t8!xFqkYP@zss1(=YDAfY&ZSc))rmzRKdKzu_0!+w(RmFGx?+J!p{@hRE*j$3n*s~$8r;|Jv~ziW{5GdFe_%nl zX=%80g;Qtq$e*@V!UoqWVs$6>RZNKEZ$bQaxbIg@#ObQn`}ba*_oY1gXza}OgS>pZR!Gn?r<=7t|ZbmCS5Rg3k2Am8hAoyziz*@usDpi;9#T`&V#r-2aDjH z_8R;4UCx*ocb+kQI%8GTU-MUmz0gjr=IW|VR+V<@FYf-HtsZBj#Oiygw#r!Zmf}Y~KjSYod%QBtTQ+~Kq#9OOEyboagxFPcV$le{uSz85v~-dmhNQBOq}|u_|abVCkJjV>6w;)skg7MG_k=g@MPvI ztlwHm8#y;OH|4aebUNB9V)#|UJ}quorouIKR*2JyL`(HV%R040OYN=4W2cB;VL9acIXm8(~-Ajx)z{TZg*em^yX0=IYKL_tjba8c6k5u6q8>q1w9C2tL=&Xx2DhS}?=9a~79Le?J z5p^B0VT}&Gm?0!2G&0f_+fg@)sd#o| z!&>n(vIo656G9)Yl(aEW6$+97hrz^M2rtzOfky$ zTL=Gwo!HgQk}&wR;7?bBu8;3d`_6r-7xII59=*q}?&IaVoM2vgAI~v$+O+aGV`Uge-FL^F6|R5EkMbXT9Y&{S!Ged&bzP2laBv-Zc}v%A zct5_P0jEX#%C8<}QV<|I`~Ha(n@IWlTllc2O@+ZH^grHC?`liM@eNdab!tBj4xjTt z`>O3G-lz2SJ4AnBGyb$yDtD)uSH8%F6Nev4YD*E+PGPiGto-!;edYsvRrThwtqi89 
zc}0BUm6{2i2u1(Li+JQWDFtko50yy|aqg5D`%|gQ-sW5@w$iHS$A>wyXWxm4i0J*9 z`=qDw<3^lBJ($G96&wkq0;|Wo3PS@N&=}L=!V)>_?nV~-hWAt-8zaTuolC}s63YI@RvUg&% z&+GA%Cx1FqyZBGMvFIHftUOmF9ntrz2pfNC$y+w%SAKpg8ap~p^Q%4Uep}?&kx=Q$ zD8*XG!6Av9{jqQHiWMdoFJ8>Yp0($x$LiD>eDcm(vPHAl1-`(j>GQpC=3U+cJ~(~P z_;k{e{6&m!AA4irAF283aZi8$_J~8luXnk3f6F@HlY4LTgVu)3%wx&cPgm}+4C?9W z5qc_rPk4t_eY_y+%Rw%G7xPmwUR6B?d=z@ zSz~d)bA?{T^JDrNFSJZUA8bABkQVJA1?a2@^E8FC}6<%G`C;z5kh6#e48UBpO zhPLA3_`OxV6e}K1Z55Lj2dp`vQb#23f#41-{|-0?qwK5m!*w&=jK)WY>EA}*-(3ku zFwV)#%UdIUhC8b*MAA0OwK~I<<(Zk8Ik~(-`d5!7nB8lwj17fn6+(P}CAfUqGNT*I zwwk%Pxa50qxtY+)xQ+H4ER8-Y7-!dUeDTVarW(&R^%(v~bsRs8J8^JInRp5rjg9=t zsjI7l(dEG8E4Lhv|5O%zwCJmaSl`%S!>;1CE$UIekt$E+FAhd)C0mE%567R75b?9` z(t2`MAMiQ$^5li|`r_il`kq2~dJ*h^O=oS@mSeBqZ_#}DRxL^`cW7wH&y_V$Rp}fn zHVSJ9T)5$YU#QziuEp1vS&E@Db-tU9yraW3K4_gO8Fe()lKwQ*-&(yxQBLj}oeA{gB+fN&rw66WVc~@6ayY!~d3LQOACgj21L0Zymu}YZ ztBNxY+;e4)KQ0s)XIxbMfCc;UEHDx&2;8w_?y@c0c1?`N!P8$qYilc-Iof;EH5s1g zLsCjgK$LoPz>K-e{IHGw)=i)NgCx!c%pCsg)gWk;cfY`8eE1jHOmo4GhV4FzUYyeH zdAGQB)zg|IyopP>zq>s9fY08*q|!SZ4tl1hrt%$rc-~*)T(|@R%HzD?UGDYsxMXVO zgctKFzD&G=gS1-T(^UtjkxnI@PQ?%@M>;KgwY_S1ZwVQ}N9TO_@S*J4k?bv6NyYhr zJBrkv9nP9QbLK_|U9Nr+CB@MO)<8#%QmS2tB0k*FmoYwK-q_fv-!ND^ZogplYI8Wo zlXSKKdkSC?1ruWp6aH}Vf$R6*@B;$dnZ|{^@9%18cz!(Lv*+p^g$6|-bzw?@6*eu! zSs7ZRGNjkpiYCD|=p5HMzo@hBHn>T?Ib-<^!K0P(Ne`2i59bQQM%tP1X zfN-}Marz(GS@D3vD~}#M+St_8v_NcpsF7nsF73g9Y0s1}42%kc{7r7b zpm-%EtLbdY38&h3y(wzL$+=??uITUc&C$N|8vunlqfTT^8qUK3pW%jy@n750Vg5Nr zasHVT<25V#8XoM2t5;G~^mI)=UGruo6IYh7ue&OgL(a*;$tj@mqt{MFtFJE~^$ZNy zJ?kl)=0EdnEL#a|oF%i1mp7+2&9U@nQE0>WKKU)@%VKpKe!@d#O))A8kw$1S&2P$k za7!r1W$3#HED*>?JJI?3yBQmD=W$AjXQka(x;a46U)aC9`;3R=&83@-tbcye`|;yP zCrh&N7lm<|t>YqrGOi=i4R_zymkR0UtY9|d2JCs9yWDD%l9T;+)H`JBIXO5a0ma-j zt$O{`wK~1JyW__M`|Y(mjTruLjSY$H`inT(l;EM|gm;a0U&^)y8W#4+$Kh`;WlWmQ z%a_|&@vxm$%a<<~IraV;tAYg>v>HEc$i-<_O**4j8lfCI-8Lt4CYHqBPsZZwwhg%! 
z^=VZr^xH3t-c^=4tJ(Sf`eJg1X#iR|EA-c|U#~dQ-@16s!w)$*0SLC@Yr-^emJMK8 z0A|@TTeob<$F`1NnD!|+)I%tdJxW_kD<8P6y{*5mPXw!Ft@%Rk&idJIn#_EFc5~pH zTZ|341_c4-@m3A>7PU!V^pat^JmRy&a{M+QeX&ANFn0R7zM-DRoYK&MD}@ctOj=WVV9hkS(T0}#bmtCHoA$3?R_HI9?hVI-Wkz_n z(QJUJ-D32WAJtu{KhC1HssLSLSYPdEpJvdk@}wT-lXE5z7HweQ^6% z(`#H>pFUW_nP^d^0(UoXoY9Je%B<#1%xi<(^{|ZO)YKZm*J*L7=NakU@8%A=#=t$r zS#=?f6eDmg+Cl+jDWA%Fw>!pZI02tmJORLxRJ5v1dZneU-Ql%&&mIQPcuH68Asny8 zT9<2H=^vyd24(?_<*0T&{U}1+DjAog8%y}sBP~cyPR<23oET?WJ?^qk`}9dJ=2+Z; zBI&`PwDk0)ix=xpR07xTKX9NrHEgSHh8DcS;b2Ky&XneB*Ji=WCrS}&H|gr?vTX4D zc$h;Ql|#xJ+$UTcDyzbkRwKp8edPQVpzsmkQ-WFf??-JZf%i5a<>%!+0K{wWpj;Bh zd;&+dQBwZUsXrmR+;y~5?bNZKwY9ZBJ3C(>&@tslyT0uPkkF8kk#XwzuzYR!$F6Ce zqk*x<;=;3Mq`_Tjz%6vvhX6&2aZZrM zJi zK?<#bO#UdmMpx=Bq#SKObMFR7JHM%S=~F_sYucReAElF?lH#x`QHeR8u&5(o;Mr;6_3QWSv}-?%%%SP~cRlwPv9VSc2mkEeRXXzsGU29zz@w_+ zieHc|$0Nw6+GQSpZEzi7C<3@z(jw6y*XJ{Qh-+%k=`PEL6U-P2-F1r_GZm0_w!hRG z?T6uSbe!)Xd3K>>h}lXJBVQy1x7SGY%H}mhTb+GlQMG7|0lDccYfeti+2J4ETi@%> zb4>-p5SOw$cTV-{yp`<wAGxAsFHG$n7^t|ucw_%)Ur}4mt<~&B zgYdOUE!~zOj?F76^A2?%Pp(e1ibZ0wbn9`ouJpl>^oj1L3J(^omEe`p`cSz-5CDPQ zt1&h?IWA|XHrRw0U5=+}nDW<8Ct5XR1Z>m0Snv32E%2~5aH93uvtO|J*c9v55}pt5 z-?!k17UKloX%X&m%@)*3dI%gKqf&76h1MOR=128$@9+QV?=Pd=Nm*uiA85zjLet^6Po*fCy2iBWc zsgy#&rhOiAx#3{`Y;Lw|5EH0#;^MMl!CR?-c4+r<#|;upOAm~(2JKr8a%FNQX^&>C z&j?5Eo4eu((9Xsb%7wo^xa-SpP}d2*2N~myn>Q6+DyO*CaIAmj)cIW7r{1A)zUy$) z0}7owaC7wU&v!O>;O7C^nx4VIqktprtZ1Bx$V0)BPFUSCEUokH_jZ}CW3aom;nIf! 
zJtv%%+Fg^+RbO%o=Xhrn2WN9=1RJ(=<;uvZGv+)yo?v!ez`3(lLOb25d>wb4fLrF$ ziHy1q=gpCI9jX0#8TxmG7XlERv$a)+=SgrGwhoqZRFSa$ne?l-XJ;`$1w(V7zqRzU zpOC&!rNcn`F+-q|adCJ7kSeb$U&FBl%eWo~fJqGf<84#vyj3dyN)_@&RiwB}mM(qT z;KowgaKQ60*psCoC*<$(^WjuTo@-XT_=~>jWu`mrnFXIYmHhfSY~lmAaBC=P17HZAFAyBuw1v+ zR_!r%d$XkEaR9fB;2IvW4$B|Ylus39j{W(W_dt36YO%7*vloMT+B?=?s=NxxUG}ju zk0bFWM-nXSj)T$k2Z{=vZft2~&&n7-biJdED?;h7>C-ULSlI+Is2t*H=$u zhQ3V$PB;SCo!Vv%6IX-Hc1o@{7(JFbG}r;RFnE_&X%*-=UcZp9`&n<(i)YMLPMhj7`wZB*LP^ZD;u`Xa?qh!*xcvAvL{8%~?bJ8Y zCUwbct*or_JgnoZEN0ht;l%P*D|OgWwhf<8l&-h;f_qy}h-GCC33gwdzxp@42($QB zhn@Y-OFtVMpMc^r$+|d$y$@Eo9fzdp)1AEtUJef{VHp|;CSUgGy8gLG!2(JiQ^=o%c7<|&_`&*+C`XtJTHk!WS znRJWs$ivshMK8w3SkX9#?CV+&0m**B0xwy;Iq;uT1)8n?Zx z#qZ;>A|6iKY{s(PWznD=DKjC886o}%@u)bCL@`&s&?eeZ?YX9q=;>-Tq+ey`vFYK0 z8AE#zP0fI{!L~>3zVsL0rcH8yO3dDoWm`14ckI|v9&_R-lAqFtyDw2FP5?^sQA@Tq z1Q-1bmt7_0@Uph94jj*xj{0=gt2eBynOrT1q)|9AK-Q1IhnbBI^{`hyfTKBxT{?_7 z{0F2mKRo?$kWNHE1Gsfu;IQs81`9$F(yZcnB!|q9Y-?#mYJbkB^Tt zUkA@D4`VJyVx)9s&QcYmz-|*`_P17wG-ntWGfZ(SVtxR#!{I*o#KkqgfB!D&(EZSD zeAw9eTT5MPVsl|IQIV~6_6_RD=4=~t=AZuhayJ(Im9Dur*pjU}X_`cl0Z88y(mya` z=1lYV*A_;dNPI!hm00hjfLJ^7J32PrzDuXTU*y-L!plgg6qfR1^@ zkVnGWsDn($HQKT{c9jv;2OJcQm7X(eRxu(N0JGTe(9l<_B!?aJLSRsciUNqldXY&V zDpqgSv1QoxB>?Z3Bl)Gv(r%%SVGi6uxDnNge!0pPhSArTdLu~r-~=|ax}y%>Ui0OF zu3IEhJf0-WEI>-4Y5{!0k$0P-o-s1i=v596rXF)#y(H#jPqle01)1ts`W|%B@mwu% zg4Jn`aW|F=zQBU!R#qO>b?LbR7U3cC&o+2UHN;XIJ#TXV!bgYTY&wbKq=ZK-z>O>p1&M!unE|j6_;~+K-mixM$`xT- zixn%^KH2fX>*60)S7spm=qkKV`k$xxcz=x1M<8hfuu|&YZYQ9oIkVaR4~e zH*B*78v#a&F*4G%#t=}6CtNT4DiL!z+1c3!NE^1Bm3^|Pj?ew>h)iDu`MW3*yKQ*i z@RL~=!I=xOZ@By3OH&yC+^1`Q>WO%rDF8O{6FChH4GsjaG4>|=W&IP-P4b1o@oT$()Et=zp@aji-;z8#W*XcoU*8bS%0Dy#5Ajt2+N%Z}= zDJVH_8d$qDL`zlFt7we@XcWPY=7YXod|KZ=dc&(D4v5hPDRv!ZuZ;@KkUlA5pYqR7 znWku95SDFiZ|)52X+5O1sISH<)1On?i5AEYw!a#w8|@BJOR$mVmlmVW-S7tiFm~kJ zf_G(Hol0X)>^Mg;q%8y>DH7$H^tC?=0xKy~*SjpE?^*Ry?~*Cw1IW3$V(*TTfIF3j@C!fI3(}OH0eXP}uB(4Y{LK 
z%+byiN<47cX<}?p3Gq!4DVk?sU|-xZw1iDX*Rc2_ZsreGkO9uxm#C3O;ZdN%rUn35kapn)78po%1VJUgZM%Uw?d>EYbzlaf?&B^Z&FwpS>RJtzTd1uL3oP zLlSUOPURbO_T0Ipphzem9vf$M9Zx#*C{DrYhmK>@QVPk(;*83%nW88^Pz`0(F1NHe zg=s!(r_MUB776c4CAjcI!2vBZ-sff=cC_gJNIo5IF50HGcIOs7lWXm+Rq`zs;dtoWMw zE4Il#dh}?(edD1ZhY}t-dvfnp?dN|=M{L?(5+kw>c95XF3kG;GZhc~3f4^@;#IMD% z9Qfedbe?DtZW5!wMa4V)F=NG}v~Pwqiz>d0H`B@M<_<{_;RZM^Mc&xqg(&lQ*Z9z0 zxQ(MArOQ+8t>$ydXrVlI6e*{u%g`A|XXj`Hy+hAe(H$;1{GR#=jGr2*FB-84@;(HN z5bRnz5*sSuXv@-_?X{Dw_ro<71C##z_3PJjA9_Aw}8{Bnq(C;>4&sh z$f#hl&>4|6oZ-k7IBx->lL#IlNggIdOi)AIwrl_T{pV*|>{{j5Z5P!O%+!gHrph_P z6ch|42tXw^mB04w-_M5%!*?$Fa2rpm2zOUn-|ann_o^dsC#R(FLp}rxR0X{290@|H z(*)9_Wg4^bX>Ot1&Uv3|lh28Q7$^vlJ^`sk8E}xOUAv}IsLa@lqLtfDp2wv+CJ-+4 z^ghrne$Xwz)m5f;WkgrRoKWzgLy9lr1#kOdgnCju@T$k zB=P$lzkh$;mWHdy3VXK~p5Kndn4S%mua2}Cuu1`op!%yYI9Qv^{X9#~Zpu%hEs+b8 zVWZTjgVO4xz606n+p4Ez*-0G#ca|McP{s3YoneRH?+m*sVcWc%w*urB%N)lS2aq&)*d>(NOQTpc13>Ib|~7?e(giK@pQONgUmfmQf_h(MZNf5tuPH zaDu1_1R(dHdnP~c?zKi1gx{cSo}-gf6zX4IDJ${NQoCdBhuGWAc91)_+SuA|wpY`2 z&x=c!Eel8H(i)pN?zAX+CO+n^$irl#ie<)VuL83K0ZI+fjyRol>(-5+G;sIS;>k5Q z&zQ}P5`-GCD}xD0>??EtNmeEQ{~`k)RB7!i=W*C!sy=3yqLwCp`ZG`YEsQg zH)Bw34=SQBkOufd4G7{e64-~C;6#tZC+nj%Y5eKEpEsibq5(%}J34fhSrTZK5s9zx ze&D6gg|XpMf6ZI*keFpiK-iZDS4OHH#HJ|SP@)gWP4NkvTY;>5yM3pY`g6@AKrAxp zsG1`BAO8ETjAaE-)cVu@2a^zdfX=N0}2x6D{_MLj5b-T zgic*X+V>ESZ8Nxd)PO1hE@&BX8rJ>2_#)h>*;QbBqrlK_wyIA<3g})^mVtH~WHivl&WHWc)P5;5F?x=fcp!ZwbAq)C+gSYz zC1^8vP{^i=8`T{=P%C~0)`6(v`+~YJ@O(t-%7d8kv82N(TjKnjt+zEIR6fP_FM}WN zud~<1NJ&YN=n<7F8J#QkDSB*X5za7zX<$uC*G8Ffpv1b*Uu&wRb%Pfcp6H<94rX5^4tHI&6#NeavMMM2wk3 z-eFX&MDa-j>yu#JByZbMt!|Yu91dRNY~?S{6+(LuItSd5!`v?;K>YSjL2bvJUJs)5 zoC`ytj%>7iR|v)p{A@YW_x4gMD-tIUo=d4;;(G9u*(fBMm^?!&6KIey<$QUeVw^amZhhs2Wgu7PyJR#GRG(E`x;Jg{^niv=_yWs zqRh&p?kx#?nHL}$jT-WgZ81=Jl0A2gtN~vESHWw#mR^xl$7i}Xhz?%AWysU87@HxJ z#4$;Q4zL9>UOT@5)KNFV$uUzVzrYk^zL>(a`5++hyEMYfHSBp9B-C7_>LlKgom1)= zvd}iz&k-nfd~9~oO?>JX&*k=kuw@98t)M@nkdMhL6Bm__(CgwL_l6GeL)A?a=iBvBC^5o#%Q>6Z9Q+dW{y+~v%mHQ!@xQFU3=&QBOx*0AKd}4Ha 
zZvTINmFI&QeI@J+ZV5l)y<{@?^?oO9%?2(&73^KdMCOB`rpXUkA`F0QvT4p=NqS+( zwiWv8xpG#ZbPlVN(YQmOQn`ANof{mzI)WPy6Eb3j*B2+ht%YxsCvKS_H~!7i=jVDG z<_%BJGAe37Ftr)lXO!TIWCGO!=@Z_b2pgmdr1~9{th|07MY-upKX`J3i@^>?F57yX zm|_${&kY?MqF*f#WKR8rz*U}TrHiajf=`2NocoGA77Emv8T=~q6+eBFPZMsrx9Ndy z6xnIK>0QJ@0Vo+ghGB30V|}F|5t>X;V`J=dkB*U-clg`#?3sCi=Hmpee0LNT@=Fc<~9!CYiD1Um>MnUw!@^n@CK1 z?bru#W?L-6(;2@I(6Nxe@c+lai2MipskChjhzx}Rg?|s+jPqq+Kd3xA_Nu_B6m_0S z?hEPB-(MHdga9*=iaj6=Hwg=?AmuNIdMz5_LGogM$JOcH035A=M8!}q#Gm_h2s*;n zdq;J?pkVRy%n`}+9_Nhy4f2}=X=9m{6X3vnel|6c%AJ57GeI8f%f_eZGs7FTmsZqI z%3(K4moNXdtqUSZ#i#=MyR=l0p;jqu@QFLHBhzw&Sl~Yx0a=wrGrXy2 z#^us|0qBK$`2)sE%}KwAU~l>zfzYfjMf5v4oww3F%u^0c_l|A{Xb96>ivM(e$rfhH z$y*3)RCxrPQ$vju{NK)et{s+I|IrH3#Xwd#2Q^9NY=J#w*6^WIKP8=Qoa~Zl|0?vi zv75_ENlU9k!~FMMrpn=P_M>vd_z*?k4Liv!nPCnE!SPhP7;rU@shk1pL!tQZV=>(u z=du+Ab>$oCbROlh{;?dQd>WHp^h#sJq-WgH_sLimm?8$m<|Z^7;La((#$@a|Ylolx z!8!h9NS`!;vEW`XH+UEdT?KHdTuTot{nOT zh?t?il|HSK^l=V%j$c?5-CUNx`|FdFb!|V+(XW!~^c!Y|L1HsFe;pLKZngi|zA2nL z&(A~W4b?D@(5{?c()ntd9K160Y2jhdD2&L?8Pu#ie`bu9^t?ybCQba!wxv&V3Ih(g zwJsLAZt6(@Exa>%FvvM`=bjs)yS{bulAaeU;sR_}oqR3=SiY-#L3ed(IAP^exvkp&!M7w|fI>#P7x_u!U?YJWr4QpSbYA&+L ztUFIn_%FRH({73ZXd|TfL5NNe{qv$EqJ>IRVq6}bt8O+QZ8oZ?IrXx%5B=W=Y~WA6NRfCXKRq1Bde}v>DzHC;4_b>u zFUZTh9bz6K2p`yJ`qo!-xj&iA)hsk)C3M}SE` zSJXs)0WDvw%!d!TZ}QXC&ZHWwmL(M$X_JUsKq32Vk>Sk$V5FLAJow=G8lF@E&72kK z=vSZ`6WS_c0hAjCC18`i{r&L|Ct<*#Py19kApGs90+2ic4SdA=Gl;s@(a|~dViNMC zGjJ_k_pt;b{GJ?`ZwS%wxD{a5zfqC&ia9I(X_O0zyTc=}`3U~kT21LY|I!h}&d8b> z1&-V&>ktmkkxa!}oj17kFO+8fxAe^Z%P;c3)T#Ubvb*}6W*dgqhGZSoP=|Eq3o_;) z?L~MGP_p-@7~p{Y27OfT+$BwIF=V55Dk`3)L%74QmGqK|Tqr!X zBFDG0Go>}@Tvs*28zrLOpwo{+4zeb^GWw`8>TYLM)amz}mjmPGI%qBz)KP;_1T;(g zHtBQC`LF|v$3@V&tAJ4$J3DFBTeh_4E*;^v+*jwmQcC@~rO5XkTk6*MoaZ<_eNfMK z_4N!+%hrMJp?B{}G*hIj<$CpPS2L5wPOyaZ^EXKE!ejhh19O19vKTzcVnqm#oz^F; zYwGRld5QyE`XV(mZu8sH+hP}4nl0^9yUU!9SIeO*VRVekE%ZL@73jX{_wRsj79W8p z7Y8l9zT@OuHmGMS$Hk$b1~$FW4Gftkwds%Le2h+TAk zRvz&2g2I_010sHa`(t*$D1JHxCt2M;H 
z-`iL0^zreQv&!)cYw7Lxu-=Xp9|#FLDABi2uxiEvCT0C-#*9E(9t9eVRBATZIq)sz z|K!s>3|7j5*a9UW&`KIA2+K*w1`&%QNhkho^N#(6i=oj4&6S<)!rrsMdqC(TzB_RT zHz#hqcXd{JvCNrW@pmEu3#5Jz9iuJn_V%f}HER;9=>BIt@ec450`14>9z;iqGBOk& zn>OxJqd1A=J>n-2H>gt;Xr3f@K!u+JL^pR#8{EHocUnt}$yzCg!(fUib-yR=Y=?xf zu*&fJ!iBPDl)(f|#s?HSpxllI_4CFo$7iiDJ?9Enx>|ofNsWf#3a(pLkL|7 ztr+%;mz`TB8L3&MeOG;Tx^p5a=TOny44v3WZ=Rrf#G$pu4?H6FL775AYa9h8svoKl ztWd9ah81^fg7D?-$*9N*81ktKem*%KS>^68Pw3BwUFY57bxp*ZNWCA+D{}2bO zgq#KWlREOIR>>*M#CN8U6*VDOoxPa11>9u`0>)G5W!g~dA<@*k>}_&?sW|p%=511ED>* zi)P+ZG;J(iyS8JSAV2@-Q4-If&z^eY4n-{}Tm}-{x;ftz<#|15Xe=9ISLo+M^q)Uh zFzqBb%>0S*!Cf0QW*g6)KVK9@HKOI4VblYyNP9#eCEgB+JoPuCg$H`MV0GRg#`f)H zKR@5gi?^sMp@t{DUBtAL4h&tw!$^IDHF-E^XutH9g);(?`v>mN6y1^4*r|(#^Xkl0 zVv>@?DXdd-^-@kw0WK~sNI~-;lFG;jM^Bvq<|r3Z@eb0mZ|`sDCE(zxN2xJ!c&p=# zy~0kg=IMKCX;}t4G-SA@()0$1Y0SaGSlinhhx&Y`3EO-WF}^~;~iXe&q84W;zS93WEG%YNo#3*-_t{O`4;hO zUBYL(zwHKp&xbROc?`4f-<)Wq-g;4}2BPyWFJ7r%1{o-k??qC5o&X$8XiI^q;p*HV zh#;q5PHp`l_3W)bysQQiTe4`;V@TdjaQp^Z6K8sYcZq~7kKPpPH=8T6IE;DM2ED|j zpM`|Fu!dBdw|3MQ;~k;1U1sb%Yh%jU#;cISL?VkZfmqeYU}DM!9jMW$B^{R0irP%! 
zro~>@*rul9;4Br?(L!ev6obR$UqP*zqhL62DX&J^k1XfTojcT&6eoZ63XTod$1WFo zVysCyT9BHWo5MkS@Ne0oh`yv7*RQ{F-%SnQAY6}Pql2|1o-bOvc57?Jg+|gjf@Mta zFHsJ)uML?0Xu}uOmH7iXF~Ys72m9Li zGvD`kj2*<33=$H-Mskhf&aQLfKZ`0J%U%&Tr}8%qTOihxT9B>g_B^tI?a?#=)F-~> zzt+10cS&3-wD1oYr1QWwKu+p|%iZI?{QmuW6Zkl2IPSxqSG?qwIQ!0aqr>ORSFZ*@ z^sXOQm5+Tv+o79Y4-8)sEhmaFlmgTx3efR@in@6|_VgAi(f;sUA_^Y+p$;`iVNnS@ zhtiodFONa@EYz&A^BB-NpKeAPA8IVgp!S7w{6G-+VKd{vOd8*xKNs?|EHnt-g}tDh z&&BE|F&u`GWK%yW^%cvw!mfEi;x~g#vOSP|<|1HC0FncGOf)UDmlg#{=u^3knphE| zPI(IPqOlTAYh$R(DH@$}n`>96Z-5@rC@TUBj9{;k*cVqPtno0&|67p!IihQ&y%bIT zr_fY$3I_b^vRRM#Ebd4xU?3DXgfSps$Nn2jgvh15e6q`~-7iu%GlP_ZsFxW)2?R3r z!xo*iuPgM|9C#}G*#>p_(s)zVKBP*1kirL|%gi4qV5R<=y>cLO@sc{sGCjv1K zE3Ia)oXaNksifec;wHXb@a=*Ok5r<~F^ToUJo3KVaIWKvQl&4W;`5H$zmw3wj1rIJ zt5Z_^Q|0*KKSb{WnfBG5D_W`VDdE@~M(xOvP~zj?1{a%9vqYRYrjJUA$u8AJ%a?zg zVl-#@gu+kAhNyp=JbA(9*tAm=K2R33^h{p52T7O}DrMy9{n3y{@4Bj1W?mmbEtD`$ z6L!73QJJ$8+&T;L)3HiwHy_a5VO$Vk=(+A;I4?}Q84@lLJpOj{L2rX#-3Y1ez;Mqx z22xI=x96uo(R+zYSmpXq7arV^d;4Y;uhPGl2Av=2|Gid36-svQm!s z`gxPhN8T2r0Mdqj+{{_=Q8l#Z$6h&z1oBa#eJ;RpsGl_FaFVM~Ve5lx-yh2O!X;ae zm*oeD<{@|$r6K)lM$JMrXGNUzpk?W6LxsC&7&-?i(5V(A@A;2T|HkHKBa%#`mU#*k zhdv^sQN;^wbHG6>cqU5$oivjiFD@&P;NQ+D%~HuA|AuI?60Qm`#RNHkB|3c!J(J6W zODiiY-QG%|)&N=k_wFZ}QtSe%F94{i@0xDpgeGF>)lI-)o}}17dhI#O1uV#HvE)Ey zyfB$?BqxVntYThq6OB|H9g`v@dS-TVbhP=!CfIHy!UkI5_mXC^qtSXp(`LADbU=kW z{rPPQjW7vxsrM8`z<)!p=tJ4eB(VabH}&Qj=KI}w;S{+KjSKxy{BA}OhI$66Ka7AY zdUYP-odvP7Xka@E-jllSM3K_LN9R`=X72&C9eAYH_~XYCXu=5H;FO>UqK>#lnktkg zBLCU3L)Tm@Fphgj)$k$cLkuOI`k&D#8ft6+_@L@JBN8r&YRaN$ zL*IDx7NfC!`gC)GruBcaR_KFaH>C+1Xih!lnfzz7>Zwc}E@`L9K_yYb-x*kY8tcF- zEL^hUT(mUxk<<7KUnDmB_U&7t?@3|NvDx1^|6A6jV9l*T>wvR^cG?$?1A%0RgA~gt zDk`GE13;YxGy;XbN}k-ZR-E-36H>P4ber3mEu4@Dm2xaWGBGLhhnzE;MlR6L2Kjzp z&?{R%eZ~xbR54fSTi*KQfYDZnsW;ybrijCNBZhmC%XKx_3~!DJVUC!iqXYm^fY$V& zZ^P{eRsMq8Ux0BT@MQWFzL6mnKwxcvY+E0=C?CLg@fI$weU@DTAGlmTF9m8xiQgQY zp`Pb(R~))jMWB8YCb2xtP{Aaf9XTt`p(&Pi&UATXHqMa})^O3hKaW=OSC~CL{xy 
zvuDlP1|TLv8-Qn|CJBm{8PxP2q`59+`++QZ9vImZ;8l{66HcSXEu1Qp-xQEf`hss? zwZrl`eAQ?0edyCH6j-@z7K+jsgSOQIlj*RHHlq#IO{qw=dQT-@o}q$6}lEi5b^ zYY%$&UcPcAa^LkunD%uI>e0zJwM|6GhzmdZCMkd}!QN$r(Y+W*6AKipOrd=wvE+6jlYqfc8~yBy6B zOcku%QmEbFK;9i|dN*~t_ma-Sn-?41lL!9%xrcT{a$gwfrQ&1L^m!>w%L4U#*-Pyt zJfEcNNc`BIFVwvRB$MLU7ml)-EoSOb3k3LqBWMBXM>F5=VEg(n&?xm2%bo0``5kk7 zXg&*d$T1Vy=h19H1eM8b5nXOTtb^{fwD68kG2Q?nWj~^b3aWqNNs2r_GLNE9wd~F5 z6GTA(Mji!QGNvPh6)grpqr?buQxl+nN*2z|_29u$0d0WoiXx9iO(+nvw=CFGS6f?# zHK1Gz?vyktnC!A{<591TIoInlzcG5o%n86k{bFEQo0QcZ8CmvEkjV#*FcLI;6?);W zdwOmMMnHT2wIU|abr>yQ0q9wicy&s1tDEjiSC$OKf>0zmf%5Z;|OlbxYCj#Mffh?Pdu zpfI-aniUsh{Gx>7pqJ%^XS4x7=HToc2v&^O;Sf6GT|XIxX-)T5v!H1<@RUK)&J|Tp z->%2Air|_yeqyN412l$ApE(@~x0t`M;cHzj&$4yE0qDl49v?y&;@!0A6O9U4r$xg; z2=f9SFq~d(S|{3cbK*OMj|$VX(#nP>|*Ct%h z&VQ2nIz8s*-x3N3nxZFquKDUmMw{&C?1y8KnlP<Qd?}M9-~k32YT&6;(i1(tA)_JT>NYuU*`*^`@8) zIxWPAJ%~*o2sq%yxfPv9%!xWL3nc3_>7pFND)!Axx~4l*g%{DFxuat%ummKI;;BnC z$q1>h)orA8qsNyU0B`NPH@JCZqK6gI91=IuJMF7L`> zbhlCK)s5S?E5JtZ)TLBxfG;eGS%Bt6Y3i{CgKc9J`Etb+#uXtBp(tx;ZZJSk3~5^H zY>mqiyGMhsU?>cYV2sg8KaSWzj>$BXvkSBi;y@6T1QBrFo88Ap4@Iu5xM8({rq}_f zQ-d%P2L*I%co!I*8{!d0ngFL7gEfo+hU14h7xJ$@1Yc{)WpfgwQUIoRg-x(;>w|0^ zNBK9-{eXy>TveykUzlw51tV6T!LPNHMXFA=Bw_3uYR1)jKjF0X14Z&-EJWd^m51KB z%0j=u&o2*uj$sB*;5~l+_^}?FLP}gxZ=leRT|0qes#+4WP9p+rA(pT8iI+?X_>Z@T z{rR*;XNi`?yjW;ynfTYiQP9w33Fb!${UahI*k@7=Pt>?@4a;>Ga**AtnLL(On-gz# zRXc()`qN%jDidTj+M)?n4b2%XZ+dsCCv*_JJ31=Yk*F2(43Xgm*%_ zQ;TsbA>Tq&+7(ndrpgH~Lw*I`(gePjk^xfw&@e-s<$+~S>yp$9x&(y zF%^aU;yt8hTrSYzc5< zq{0Kd^`(*{9-*X)CwouNcN?^Rk}?s@D*D)?!IPw8EEHvV(wBkcc4NmY@Z7B#{s9tA zCH7Kp06|nCpH_`IE~<*DgR1B=Cx(J>B~5JOGJNkhaezPBb(M+-f=hsLh+_`5I)b+V zAn-&$C3g!FrA8C;^=8kTw;kJVZxngvBI0Nrc_|#Ek1}hTam;Sv)95?20_BU&v-dPZ z4+Fmfp>qJ;Fv#Pna<}(`yT>RLC%oIX6+`3j-WTK4psT9Gn}0x`1G46-H>U#~p<-0T z2)KNxyMm4hZvpm3*3b+LA_N>Qg3Lh(x$q9)&?odjcx5qye>t_1cGS8EYR8juvY=yP zL7r?r^RTo57?DeP5lg)7)vH%V@N(fGN_fS^ztZR@nYtDFZmfqV+f=Lo(p{<<&*_tX4}@nEz6f(^Yn~& 
z?*yS44pBO}{DEO$17XlNqM(L_h)W@^8}EH}7|YCb$gAO*xQfj8?J=aLogTsAd#|ZGL`QhzeGbOgfg{Ck#o3I@;A}EDLz^d%N7-(8qcl zFBi{YJ>|@jfc$yr+9Nav)owg;d=JOw3Eg?e(Ic0fl+*&}!rr$Uv=3!bUsavn!sckEqs2TdpNm#*nl__UPWgV0O!N@Vn1JIklnsr2)tM2kjbP51yLNxPDGgIT<*E zrX*l+EXJw=4*O#6TY^K+DFiA-nzcrr1l5e%@gSTgevH)s7WaNHSS|N;Eww0M;q_FX$Mkl^^;*Q=BRRFB+DyQ zaVUpdy6N!S>*bA16w3esuh2+oSL9Vb3?#w`VG@@)n4}{o6%K_c8ae?w?KzHrN&0g3 zB#uw2TX6ERI~q8=WvJT`CV{N>tX~7zWzxPP``19?M(Q=n4Uyl2^S-k<+T$Ll&Bww& zyCRFSU0NG5+QAZ`m!O-WkgWAsqRXH<$z6CRo+7bXMkxO9URVNax-Mv9OWrf zf!LhtzM}d$^H?(;+uz8P>mD8MqtOwfXu~h?6Uuh0xV+1b+x8|$FvlvHDyuLVq#%{} z43#;kyoN!+RP90y27`O-*z+dfacLxW_V#L}&lu(Tbh`1riIFxz_*wbPv4KDoY)nIC zPFHV7Ca;UdxgR#&SdQ(wfTnT`5|lRBEX{ctd-xp;7Zvun+j{KvzSUwD#b9ZDL4DH5VBEbCnjg!t$Cw#xf&}0YwcFxA zVsOx#nN9+yH~rx|@#Z=1Y3#nHL6jmWEn?Wj#davnG4#uL47mYGIMUOLEmhv^%=QpE zK=pIrK5FPF1;UOzs_UwbIIzLtTgWVKrN`j2j+-W1W zYbWdkP3DHi9%k9o5e5)5@V%ChH_-|3PXm_}3gaUzdn%o^&>n$^q>epW zTRroPV9*ptc`7cWmqQim9@y_@8bU|yn2>x>jTUD`5e2%0&=SgqO3Z(HvY zCWRb;`jcFTvK_Ep(BFB|cA<<%d}#G+!<%DAJLo{*mC5SI_Natukwc&{2_PI(H#XzF81&?v8HO-G(`A4y|3Ih&cW`W= zZ7xrSxr&IsC#jzyKA6PE;1{$1L@a{H;{coE9OA=ZLu8hOw7Mi0qCyOuvNLN%jn8&; zvlwH*?K<_i>!Z5*#6d2T(-@ z4P2OcfbHySm^kR*!+6X9j9mtewrKQwJ*N!y1;KYlVZlc*%!u;BXoahb2^eHfjHe<& ze~b}S;x3&%V-_fn@xd87Ke+CIv3-Y8e453@rv@z*JH!Mq(ajK05@rcIvEe68eIk9K zj{BG`vjn1%$yVg!3G!5R4m&Y(0fZu|R2v^y)onvNkAo3`39ipT#jMc^KjURxZ~&u? 
z+jl$)bN!~bKsLyNO4Mk4xwdcvl2Se<<P&vnad?p8{8X^lQLm}pbt3QEr)7UA>8l$)5 z_o9ngTdWaN)lKn<-rip8ugcS>Pu~P3P)Ur~4~S!sza6!-bXgnr7e`+(oE1Bz;yC+1 zz6meAX0@8*+?zBU9(FHNE{DS|jp9|v)2OinS7fFvm7@Y76$xMp)YSqmlNx4dRzF!3 z)CeT*7wDDG`oXc~mJm&Wg6HESgat;I;#mm(4>>#*tY@O~rH0zHYtp`qSt@S;5=ixh zQtad$Bp@h{;U3rcrxVrhkbhCt0pQc4^|Du4qu#(=%q1RZn^zSYrsySNFc@sI!C^)1 zwc=dNM|%hc<{A7R8gI+cto{%XpoT+uIjj_uOHASF&>$jUETwXRZU}5iH-u}T&RR$S zbM)MBQqi#`L`qaTdYCLoU;uW3I@f_i&p9=&-M&p#L0?czNcsCRvbTgvJ8Mu9MkC@8 zLiY-$bzj_n@11{=@q*5#!gZ-=15^XSd$s*4&rnrfE_`kn09*?R&X8x5y#5(fM>MxS z?8Fmobky~3R^va5;GK>n2-7*9qciYM%Y~=l%Ey}TVnApK4TD;@_bRz=_?d<4M3+*9 z0r7+iD6FcRgmy7m4NaLT!~-L>ip24v)-UQc04E4TGTFe#H8}5GdaaDXAf@z}2}H9k zG0OiwoGtQdgSe{pOW)peRGs!bg2+0X;b1Y{JA&XhIc5)m#Xax@PjNo01h?$}Y=_yH zG>!s_tv6k@9BU=R01v2=f%MOt@ZoYDhb+!L$PmM5F|pt_c?vxLqquVq%Q;WOc*=3u z972W2XpGW0g-MbqB7ZTHq@AUfE5hd3P|h;m zFhf#8Vbaci?k@h@>)N^YpZ$;IeSe4V_j&H;e(vY~k|yL*7I4rohQuVzA4Jd^I(E!M z2Q$=*`+5PvbT4xo7^X(m8xJq-WB(!QEuCJ#4wUoROjHhxv}Evosy(8B=;}#vM8pYW z>KJ*X!XG6ExH%Rklp9<&WLzYCNUE@zO7123J|)21F_()_Gn1@D)s92#KyZjlI!y)I zMyX9DCOFgyd_Mf-&ER*L5*%{U%7Oxs=af9l#c?$a(NabVwqyE7GhrQ{HJP+^PTaxZ zlgTb?PRop)s50`9&m0*g|E|K+pxLgSa7;TX1S$PVEkn^atgg9L{*SuLANAvX{lU==zm$-6z_}#}{7S0f|minaIH*)Fano ztLmA5hJQ# z@~)kbn#-)&KK8I$u0Woe8c=7bQ4xK&p1 zFT-k$i`ZXZyGG_JKM)dD?Y%g2{o0ns`8qs;>J0ZqpKzPp8y_FIcB3%GBg_)d{>p8M z-i94*osW_$q@2L;;_7FSzaz!Je)s*U+fli%|CX}Pg^Aq5LfD*Dp zWjzC3Sy4iHUPsq9^Om-B9wXubM_RRpsPTQ67rj?g^BvlghX5_PBdG3GR)o`-MH;vH zHT3Dmgq1{IZ?>EI?LZxrq#U>~0y9%z>8KHjVr#Y0bJwH-uTd2>)>45w^T1eKA~Qv7 z_xkMB3?>@NFBUPwsY)K%cJ>3$XsMA34Sf;p6gv^AR5yT?A?*%S|kXe6)vBLb- z1)d`l-m^Z$^Tqz7+dVBE3)}qnd~$t4LB%B7I%nd)EJSHJ@G+ueb+5wZBe<5*LxsrmRa2OIxPn zutuy*v&-V1n0mgbmMV!Y6Y(p`nM+`&)&F`Gss022y%ax#lzGGc!VXz#~#-v`Hv`va5uGAjk z!KqnA-Z!0EKR=dG?zN(|ZJ}?)e+a#nap|2%e6y&TpKYGYh_+mj!Lm=fP#0 z_&ZoPYw?!Sd{cLU+VPzInj-%6x}r5P*hR#C*9{HartrBLAXC z-B~osh8UB<|6j^N?B?|GW-6DX1HZ~Tt`1T7t;h*{i*KrHq}`~i6sqPnrw87!@8J>F zXT%+^RuS2I^7uDt92X5@e^dd{Oah z2-gQc!gb*X=V4{axI$iB3EQPYEQ1~(W4|XN3{0fK>Z&sZ`S~kN61`9Sml>o74r-@n_mKah+g 
zYCXcfBPr>ESlO#s*=ZQV20D&c!(MSbrA1S6N| zQY+LVImy-#O+({)J7twGLflOXO?e4F@@2!&w~2e;!ClY^O5$LsDOsv_s?MZx=NdFM zH&357y3@hN)G>V~Y$>Dl=00-b(dt*rl5l}VB^RXPxp%=Qy~YfVF0Ljo-Ds6>k~*B( zj0jWmtUSOC1KUsZ=Ug3uWZ5#nvIrlEKbFTKDu;#Z>kNT$BY|X7XijJr$fnxbbL7NG zaEE;t4m;d-6(UOAWr(*R9tT0q2YmXdNxv?7n?}F7UTYw}v9JgtWtg977GA{5jg|Ni zq~M3R*Je}IErw(nblJnDJGZ;mNR@CO_@()FQd#p3uFIL!N;obbSSX%;FjM)_jrpRK zIKM$>9Pm7T7#L2P6P6>um$3=+mX!1)CzeHSG>h0rkoIw4JkIWWBK>{N#JHsNL!y$9 zT*C{PCG3YyT89QvguV_qB!SCPNPf6zEs_IVVlvKt@X6tCx$F>bMZ|sd%X^mE!hi;y z6_H5`OtPBu11uu?H9c%mqtJ)=a=inpPCwPqTa<3=c$0OwrTUz1%BL5VhZPX2N$`u@rMS0LaHs%Lp_l^C7SNOMVT2 z1C7IM&u;#BD1bJtCAZ_Xu(p*ct4}FL3ULra7Ay1lyq@lryeF?OcuS9)TP^BLfuh1& z&SUH%>)~Bu%-GDqCvdxX+#IbL+)$;a`zcsey(~#;e%pQ(3NqG_UarC})L7cM!ggSt~n^0dzD-|QvT2t#==Dv?OL0-E>QDUeO1ae`Ysoyn8vWR##fGO+S0Vh z^CN2apq#C)@-cB;%nHux7?5LfyG?)pynaSA_dQKNS>nzxQ}UmB7bIS9a?VSR;g9}< z``R%gXTfz_FXFU=bP_}eYBk)xjiZuU`It8Oju|@X4#6n=YK933;cvDve3)wseQ=A{ kCxagRzl=|x{mm?7^s7LpL0z_UMJ5x+P4(PAHgNsF0n%RxzyJUM literal 0 HcmV?d00001 diff --git a/joss_paper/figure_1.ipynb b/joss_paper/figure_1.ipynb new file mode 100644 index 0000000..3bb22ff --- /dev/null +++ b/joss_paper/figure_1.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate iteration speed figure for JOSS paper. Iteration times come from https://becksteinlab.github.io/zarrtraj/#/" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "labels = ['SSD, XTC', 'SSD, H5MD', 'AWS S3, H5MD', 'SSD, ZarrMD', 'AWS S3, ZarrMD']\n", + "values = [1.49, 4.76, 10.30,3.10, 6.53] \n", + "colors = ['#009e73', '#e69f00', '#e69f00','#56b4e9', '#56b4e9']\n", + "\n", + "plt.figure(figsize=(8, 6))\n", + "plt.bar(labels, values, color=colors)\n", + "\n", + "\n", + "plt.title('Comparison of Trajectory Iteration Speed by Storage Medium')\n", + "plt.ylabel('Time (minutes)')\n", + "\n", + "\n", + "plt.xticks(rotation=45, ha='right')\n", + "plt.tight_layout()\n", + "\n", + "plt.savefig('benchmark.png')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "zarrtraj", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 60104e43aa22ee80914b7b1cd2d54dda446a9e26 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Thu, 26 Sep 2024 12:16:22 -0700 Subject: [PATCH 07/10] minor tweaks --- joss_paper/paper.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/joss_paper/paper.md b/joss_paper/paper.md index ecc248a..0223c6e 100644 --- a/joss_paper/paper.md +++ b/joss_paper/paper.md @@ -13,9 +13,6 @@ authors: - name: Hugo Macdermott-Opeskin orcid: 0000-0002-7393-7457 affiliation: 1 - - name: Oliver Beckstein - orcid: 000-0003-1340-0831 - affiliation: 1 - name: Edis Jakupovic affiliation: 1 - name: Yuxuan Zhuang @@ -24,6 +21,9 @@ authors: - name: Richard J Gowers orcid: 0000-0002-3241-1846 affiliations: 1 + - name: Oliver Beckstein + orcid: 
000-0003-1340-0831 + affiliation: 1 affiliations: - name: Placeholder index: 1 @@ -91,7 +91,7 @@ u = mda.Universe("sample_topology.top", "s3://sample-bucket-name/trajectory.h5md Initial benchmarks show that `Zarrtraj` can iterate through an AWS S3 cloud trajectory (load into memory one frame at a time) at roughly 1/2 or 1/3 the speed it can iterate through the same trajectory from disk and roughly -1/5 to 1/10 the speed it can iterate through the same trajectory in XTC format \autoref{fig:benchmark}. +1/5 to 1/10 the speed it can iterate through the same trajectory on disk in XTC format \autoref{fig:benchmark}. However, it should be noted that this speed is influenced by network latency and that writing parallelized algorithms can offset this loss of speed. From d83a5dfd41544b30831367ab93269864c1318517 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Tue, 1 Oct 2024 15:33:34 -0700 Subject: [PATCH 08/10] update GSOC acknowledgement in joss_paper/paper.md --- joss_paper/paper.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/joss_paper/paper.md b/joss_paper/paper.md index 0223c6e..c7da231 100644 --- a/joss_paper/paper.md +++ b/joss_paper/paper.md @@ -115,7 +115,6 @@ geosciences community to align data practices with FAIR principles [@PANGEO:2022 # Acknowledgements Thank you to Dr. Jenna Swarthout Goddard for supporting the GSoC program at MDAnalysis. Thank you to Martin Durant, author of Kerchunk, for helping refine and merge features in his upstream codebase -necessary for this project. Finally, thank you to Google for supporting the Google Summer of -Code program (GSoC) which provided financial support for this project. +necessary for this project. LW was a participant in the Google Summer of Code 2024 program. 
# References \ No newline at end of file From 96e998af943fda24875bd3b402a701996958c4f2 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Thu, 3 Oct 2024 04:15:28 +0000 Subject: [PATCH 09/10] remove frame seq --- zarrtraj/ZARR.py | 216 +++++++++++------------------------------------ 1 file changed, 50 insertions(+), 166 deletions(-) diff --git a/zarrtraj/ZARR.py b/zarrtraj/ZARR.py index bd23918..402f712 100644 --- a/zarrtraj/ZARR.py +++ b/zarrtraj/ZARR.py @@ -233,7 +233,9 @@ class ZARRH5MDReader(base.ReaderBase): @due.dcite( Doi("10.1002/jcc.21787"), description="MDAnalysis 2011", path=__name__ ) - @due.dcite(Doi("10.5281/zenodo.3773449"), description="Zarr", path=__name__) + @due.dcite( + Doi("10.5281/zenodo.3773449"), description="Zarr", path=__name__ + ) @store_init_arguments def __init__( self, @@ -313,8 +315,7 @@ def __init__( # Set to none so close() can be called self._file = None self._cache = None - # Read first timestep - self._frame_seq = collections.deque([0]) + if not HAS_ZARR: raise RuntimeError("Please install zarr") super(ZARRH5MDReader, self).__init__(filename, **kwargs) @@ -397,7 +398,7 @@ def __init__( self._global_steparray, self._stepmaps, ) - self._cache.update_frame_seq(self._frame_seq) + self._read_next_timestep() def _set_translated_units(self): @@ -628,22 +629,14 @@ def _read_next_timestep(self): def _read_frame(self, frame): """reads data from h5md-formatted file and copies to current timestep""" - # frame seq update case 1: read called from iterator-like context - if not self._frame_seq: - self._frame_seq = None - self._cache.update_frame_seq(self._frame_seq) - raise StopIteration + if frame < 0 or frame >= self.n_frames: + raise IOError("Frame index out of range") - self._frame = self._cache.load_frame() + self._frame = self._cache.load_frame(frame) if self.convert_units: self._convert_units() - # frame seq update case 2: read called from __getitem__-like context - if len(self._frame_seq) == 0: - self._frame_seq = None - 
self._cache.update_frame_seq(self._frame_seq) - return self.ts def _convert_units(self): @@ -667,7 +660,6 @@ def _convert_units(self): def close(self): """close reader""" - self._frame_seq = None if self._cache is not None: self._cache.cleanup() if self._file is not None: @@ -687,150 +679,6 @@ def Writer(self, filename, n_atoms=None, **kwargs): kwargs.setdefault("forces", ("force" in self._elements)) return ZARRMDWriter(filename, n_atoms, **kwargs) - def __getitem__(self, frame): - """Return the Timestep corresponding to *frame*. - - If `frame` is a integer then the corresponding frame is - returned. Negative numbers are counted from the end. - - If frame is a :class:`slice` then an iterator is returned that - allows iteration over that part of the trajectory. - - Note - ---- - *frame* is a 0-based frame index. - - Note - ---- - ZARRH5MDReader overrides this method to get - access to the the sequence of frames - the user wants. - """ - if isinstance(frame, numbers.Integral): - frame = self._apply_limits(frame) - if self._frame_seq is None: - self._frame_seq = collections.deque([frame]) - self._cache.update_frame_seq(self._frame_seq) - return self._read_frame_with_aux(frame) - elif isinstance(frame, (list, np.ndarray)): - if len(frame) != 0 and isinstance(frame[0], (bool, np.bool_)): - # Avoid having list of bools - frame = np.asarray(frame, dtype=bool) - # Convert bool array to int array - frame = np.arange(len(self))[frame] - if isinstance(frame, np.ndarray): - frame = frame.tolist() - if self._frame_seq is None: - self._frame_seq = collections.deque(frame) - self._cache.update_frame_seq(self._frame_seq) - return base.FrameIteratorIndices(self, frame) - elif isinstance(frame, slice): - start, stop, step = self.check_slice_indices( - frame.start, frame.stop, frame.step - ) - if self._frame_seq is None: - self._frame_seq = collections.deque(range(start, stop, step)) - self._cache.update_frame_seq(self._frame_seq) - if start == 0 and stop == len(self) and step == 1: 
- return base.FrameIteratorAll(self) - else: - return base.FrameIteratorSliced(self, frame) - else: - raise TypeError( - "Trajectories must be an indexed using an integer," - " slice or list of indices" - ) - - def __iter__(self): - """Iterate over all frames in the trajectory - - Note - ---- - ZARRH5MDReader overrides this method to get - access to the the sequence of frames - the user wants. - """ - self._reopen() - self._frame_seq = collections.deque(range(0, self.n_frames)) - self._cache.update_frame_seq(self._frame_seq) - return self - - def next(self): - if self._frame_seq is None and self._frame + 1 < self.n_frames: - self._frame_seq = collections.deque([self._frame + 1]) - self._cache.update_frame_seq(self._frame_seq) - elif self._frame_seq is None: - self.rewind() - raise StopIteration from None - try: - ts = self._read_next_timestep() - except (EOFError, IOError): - self.rewind() - raise StopIteration from None - else: - for auxname, reader in self._auxs.items(): - ts = self._auxs[auxname].update_ts(ts) - - ts = self._apply_transformations(ts) - - return ts - - def iter_as_aux(self, auxname): - """Iterate over the trajectory with an auxiliary reader - - Note - ---- - ZARRH5MDReader overrides this method to get - access to the the sequence of frames - the user wants. - """ - aux = self._check_for_aux(auxname) - self._reopen() - self._frame_seq = collections.deque(range(0, self.n_frames)) - self._cache.update_frame_seq(self._frame_seq) - aux._restart() - while True: - try: - yield self.next_as_aux(auxname) - except StopIteration: - return - - def copy(self): - """Return independent copy of this Reader. - - New Reader will have its own file handle and can seek/iterate - independently of the original. - - Will also copy the current state of the Timestep held in the original - Reader. - - Note - ---- - ZARRH5MDReader overrides this method to get - access to the the sequence of frames - the user wants. - - .. 
versionchanged:: 2.2.0 - Arguments used to construct the reader are correctly captured and - passed to the creation of the new class. Previously the only - ``n_atoms`` was passed to class copies, leading to a class created - with default parameters which may differ from the original class. - """ - - new = self.__class__(**self._kwargs) - - if self.transformations: - new.add_transformations(*self.transformations) - # seek the new reader to the same frame we started with - new[self.ts.frame] - # then copy over the current Timestep in case it has - # been modified since initial load - new.ts = self.ts.copy() - new._cache._timestep = new.ts - for auxname, auxread in self._auxs.items(): - new.add_auxiliary(auxname, auxread.copy()) - return new - @property def n_frames(self): """number of frames in trajectory""" @@ -870,6 +718,41 @@ def parse_n_atoms(filename, group=None, so=None): "You must include a topology file." ) + def copy(self): + """Return independent copy of this Reader. + + New Reader will have its own file handle and can seek/iterate + independently of the original. + + Will also copy the current state of the Timestep held in the original + Reader. + + Note + ---- + ZARRH5MDReader overrides this method to copy + the copied reader's timestep to the cache's timestep + + .. versionchanged:: 2.2.0 + Arguments used to construct the reader are correctly captured and + passed to the creation of the new class. Previously the only + ``n_atoms`` was passed to class copies, leading to a class created + with default parameters which may differ from the original class. 
+ """ + + new = self.__class__(**self._kwargs) + + if self.transformations: + new.add_transformations(*self.transformations) + # seek the new reader to the same frame we started with + new[self.ts.frame] + # then copy over the current Timestep in case it has + # been modified since initial load + new.ts = self.ts.copy() + new._cache._timestep = new.ts + for auxname, auxread in self._auxs.items(): + new.add_auxiliary(auxname, auxread.copy()) + return new + class H5MDElementBuffer: def __init__( @@ -996,9 +879,9 @@ def flush(self): if num_v_frames == 0: num_v_frames = self._val_frames_per_chunk - self._val[self._val_idx - num_v_frames : self._val_idx] = self._val_buf[ - :num_v_frames - ] + self._val[self._val_idx - num_v_frames : self._val_idx] = ( + self._val_buf[:num_v_frames] + ) self._val.resize(self._val_idx, *self._val_chunks[1:]) num_t_frames = self._t_idx % self._t_frames_per_chunk @@ -1248,7 +1131,9 @@ def __init__( protocol = get_protocol(filename) if protocol not in ZARRTRAJ_NETWORK_PROTOCOLS and protocol != "file": - raise ValueError(f"Unsupported protocol '{protocol}' for Zarrtraj.") + raise ValueError( + f"Unsupported protocol '{protocol}' for Zarrtraj." 
+ ) if protocol in ZARRTRAJ_EXPERIMENTAL_PROTOCOLS: warnings.warn( f"Zarrtraj is using the experimental protocol '{protocol}' " @@ -1649,9 +1534,8 @@ def update_desired_dsets( self._global_steparray = global_steparray self._stepmaps = stepmaps - def load_frame(self): + def load_frame(self, frame): """Reader responsible for raising StopIteration when no more frames""" - frame = self._frame_seq.popleft() self._load_timestep_frame(frame) return frame From bda675d1c49d60b926a53c4f6a0f84d6e871f20e Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Thu, 3 Oct 2024 20:12:41 +0000 Subject: [PATCH 10/10] remove paper changes from this branch --- joss_paper/benchmark.png | Bin 30141 -> 0 bytes joss_paper/figure_1.ipynb | 78 ----------------- joss_paper/paper.bib | 178 -------------------------------------- joss_paper/paper.md | 120 ------------------------- 4 files changed, 376 deletions(-) delete mode 100644 joss_paper/benchmark.png delete mode 100644 joss_paper/figure_1.ipynb delete mode 100644 joss_paper/paper.bib delete mode 100644 joss_paper/paper.md diff --git a/joss_paper/benchmark.png b/joss_paper/benchmark.png deleted file mode 100644 index edafbd07c7c2eb7858fdc06b8153b9be93cd49f7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 30141 zcmd?Sc|6vA*EXu@YNmM()k#qiB7`(M4JlJ9LoyFhGLOv~6iH>)AVN|hGf62KGiI(t zWF9h|hWA+R`+2VCkNxcZKKJ{+pU<=R{-e8G&hs~Xzu&div5s}DbzeVx=-~W0tLCt= zvCU`7?Nec6o5sz?HdS-hU-&Npb{Eg!FA1Cdnl`FdMmF|()`o0KdNyY)tZXbyPjTBB zTAwquvJ?>z7uddq+r-A^%sELx!PEc#0Rby(V?i#S4Ys(*>@#x5&ats^=+U1k&vr$a zva#9nGxzOQa|r6Fvv*ihJ~7batjgRleU|hBCyw+>{p=U7M5*mJm0!Gc$&`DyH}TKu zz013QI-mSO$&0$O3-}(FID0UrFH$M&o2jgE|1zf$v)4^~dF6+~{6ZJIK?A$aU=d@B zoD~~zZ?>1`Z^q4~e|S4Bkfpyhud+?WUvGt{FQ6aquRBkFy_`2?I{rFuIE$Np+_IN# z3jR8|a_TJlG4Fr!t8!xFqkYP@zss1(=YDAfY&ZSc))rmzRKdKzu_0!+w(RmFGx?+J!p{@hRE*j$3n*s~$8r;|Jv~ziW{5GdFe_%nl zX=%80g;Qtq$e*@V!UoqWVs$6>RZNKEZ$bQaxbIg@#ObQn`}ba*_oY1gXza}OgS>pZR!Gn?r<=7t|ZbmCS5Rg3k2Am8hAoyziz*@usDpi;9#T`&V#r-2aDjH 
z_8R;4UCx*ocb+kQI%8GTU-MUmz0gjr=IW|VR+V<@FYf-HtsZBj#Oiygw#r!Zmf}Y~KjSYod%QBtTQ+~Kq#9OOEyboagxFPcV$le{uSz85v~-dmhNQBOq}|u_|abVCkJjV>6w;)skg7MG_k=g@MPvI ztlwHm8#y;OH|4aebUNB9V)#|UJ}quorouIKR*2JyL`(HV%R040OYN=4W2cB;VL9acIXm8(~-Ajx)z{TZg*em^yX0=IYKL_tjba8c6k5u6q8>q1w9C2tL=&Xx2DhS}?=9a~79Le?J z5p^B0VT}&Gm?0!2G&0f_+fg@)sd#o| z!&>n(vIo656G9)Yl(aEW6$+97hrz^M2rtzOfky$ zTL=Gwo!HgQk}&wR;7?bBu8;3d`_6r-7xII59=*q}?&IaVoM2vgAI~v$+O+aGV`Uge-FL^F6|R5EkMbXT9Y&{S!Ged&bzP2laBv-Zc}v%A zct5_P0jEX#%C8<}QV<|I`~Ha(n@IWlTllc2O@+ZH^grHC?`liM@eNdab!tBj4xjTt z`>O3G-lz2SJ4AnBGyb$yDtD)uSH8%F6Nev4YD*E+PGPiGto-!;edYsvRrThwtqi89 zc}0BUm6{2i2u1(Li+JQWDFtko50yy|aqg5D`%|gQ-sW5@w$iHS$A>wyXWxm4i0J*9 z`=qDw<3^lBJ($G96&wkq0;|Wo3PS@N&=}L=!V)>_?nV~-hWAt-8zaTuolC}s63YI@RvUg&% z&+GA%Cx1FqyZBGMvFIHftUOmF9ntrz2pfNC$y+w%SAKpg8ap~p^Q%4Uep}?&kx=Q$ zD8*XG!6Av9{jqQHiWMdoFJ8>Yp0($x$LiD>eDcm(vPHAl1-`(j>GQpC=3U+cJ~(~P z_;k{e{6&m!AA4irAF283aZi8$_J~8luXnk3f6F@HlY4LTgVu)3%wx&cPgm}+4C?9W z5qc_rPk4t_eY_y+%Rw%G7xPmwUR6B?d=z@ zSz~d)bA?{T^JDrNFSJZUA8bABkQVJA1?a2@^E8FC}6<%G`C;z5kh6#e48UBpO zhPLA3_`OxV6e}K1Z55Lj2dp`vQb#23f#41-{|-0?qwK5m!*w&=jK)WY>EA}*-(3ku zFwV)#%UdIUhC8b*MAA0OwK~I<<(Zk8Ik~(-`d5!7nB8lwj17fn6+(P}CAfUqGNT*I zwwk%Pxa50qxtY+)xQ+H4ER8-Y7-!dUeDTVarW(&R^%(v~bsRs8J8^JInRp5rjg9=t zsjI7l(dEG8E4Lhv|5O%zwCJmaSl`%S!>;1CE$UIekt$E+FAhd)C0mE%567R75b?9` z(t2`MAMiQ$^5li|`r_il`kq2~dJ*h^O=oS@mSeBqZ_#}DRxL^`cW7wH&y_V$Rp}fn zHVSJ9T)5$YU#QziuEp1vS&E@Db-tU9yraW3K4_gO8Fe()lKwQ*-&(yxQBLj}oeA{gB+fN&rw66WVc~@6ayY!~d3LQOACgj21L0Zymu}YZ ztBNxY+;e4)KQ0s)XIxbMfCc;UEHDx&2;8w_?y@c0c1?`N!P8$qYilc-Iof;EH5s1g zLsCjgK$LoPz>K-e{IHGw)=i)NgCx!c%pCsg)gWk;cfY`8eE1jHOmo4GhV4FzUYyeH zdAGQB)zg|IyopP>zq>s9fY08*q|!SZ4tl1hrt%$rc-~*)T(|@R%HzD?UGDYsxMXVO zgctKFzD&G=gS1-T(^UtjkxnI@PQ?%@M>;KgwY_S1ZwVQ}N9TO_@S*J4k?bv6NyYhr zJBrkv9nP9QbLK_|U9Nr+CB@MO)<8#%QmS2tB0k*FmoYwK-q_fv-!ND^ZogplYI8Wo zlXSKKdkSC?1ruWp6aH}Vf$R6*@B;$dnZ|{^@9%18cz!(Lv*+p^g$6|-bzw?@6*eu! 
zSs7ZRGNjkpiYCD|=p5HMzo@hBHn>T?Ib-<^!K0P(Ne`2i59bQQM%tP1X zfN-}Marz(GS@D3vD~}#M+St_8v_NcpsF7nsF73g9Y0s1}42%kc{7r7b zpm-%EtLbdY38&h3y(wzL$+=??uITUc&C$N|8vunlqfTT^8qUK3pW%jy@n750Vg5Nr zasHVT<25V#8XoM2t5;G~^mI)=UGruo6IYh7ue&OgL(a*;$tj@mqt{MFtFJE~^$ZNy zJ?kl)=0EdnEL#a|oF%i1mp7+2&9U@nQE0>WKKU)@%VKpKe!@d#O))A8kw$1S&2P$k za7!r1W$3#HED*>?JJI?3yBQmD=W$AjXQka(x;a46U)aC9`;3R=&83@-tbcye`|;yP zCrh&N7lm<|t>YqrGOi=i4R_zymkR0UtY9|d2JCs9yWDD%l9T;+)H`JBIXO5a0ma-j zt$O{`wK~1JyW__M`|Y(mjTruLjSY$H`inT(l;EM|gm;a0U&^)y8W#4+$Kh`;WlWmQ z%a_|&@vxm$%a<<~IraV;tAYg>v>HEc$i-<_O**4j8lfCI-8Lt4CYHqBPsZZwwhg%! z^=VZr^xH3t-c^=4tJ(Sf`eJg1X#iR|EA-c|U#~dQ-@16s!w)$*0SLC@Yr-^emJMK8 z0A|@TTeob<$F`1NnD!|+)I%tdJxW_kD<8P6y{*5mPXw!Ft@%Rk&idJIn#_EFc5~pH zTZ|341_c4-@m3A>7PU!V^pat^JmRy&a{M+QeX&ANFn0R7zM-DRoYK&MD}@ctOj=WVV9hkS(T0}#bmtCHoA$3?R_HI9?hVI-Wkz_n z(QJUJ-D32WAJtu{KhC1HssLSLSYPdEpJvdk@}wT-lXE5z7HweQ^6% z(`#H>pFUW_nP^d^0(UoXoY9Je%B<#1%xi<(^{|ZO)YKZm*J*L7=NakU@8%A=#=t$r zS#=?f6eDmg+Cl+jDWA%Fw>!pZI02tmJORLxRJ5v1dZneU-Ql%&&mIQPcuH68Asny8 zT9<2H=^vyd24(?_<*0T&{U}1+DjAog8%y}sBP~cyPR<23oET?WJ?^qk`}9dJ=2+Z; zBI&`PwDk0)ix=xpR07xTKX9NrHEgSHh8DcS;b2Ky&XneB*Ji=WCrS}&H|gr?vTX4D zc$h;Ql|#xJ+$UTcDyzbkRwKp8edPQVpzsmkQ-WFf??-JZf%i5a<>%!+0K{wWpj;Bh zd;&+dQBwZUsXrmR+;y~5?bNZKwY9ZBJ3C(>&@tslyT0uPkkF8kk#XwzuzYR!$F6Ce zqk*x<;=;3Mq`_Tjz%6vvhX6&2aZZrM zJi zK?<#bO#UdmMpx=Bq#SKObMFR7JHM%S=~F_sYucReAElF?lH#x`QHeR8u&5(o;Mr;6_3QWSv}-?%%%SP~cRlwPv9VSc2mkEeRXXzsGU29zz@w_+ zieHc|$0Nw6+GQSpZEzi7C<3@z(jw6y*XJ{Qh-+%k=`PEL6U-P2-F1r_GZm0_w!hRG z?T6uSbe!)Xd3K>>h}lXJBVQy1x7SGY%H}mhTb+GlQMG7|0lDccYfeti+2J4ETi@%> zb4>-p5SOw$cTV-{yp`<wAGxAsFHG$n7^t|ucw_%)Ur}4mt<~&B zgYdOUE!~zOj?F76^A2?%Pp(e1ibZ0wbn9`ouJpl>^oj1L3J(^omEe`p`cSz-5CDPQ zt1&h?IWA|XHrRw0U5=+}nDW<8Ct5XR1Z>m0Snv32E%2~5aH93uvtO|J*c9v55}pt5 z-?!k17UKloX%X&m%@)*3dI%gKqf&76h1MOR=128$@9+QV?=Pd=Nm*uiA85zjLet^6Po*fCy2iBWc zsgy#&rhOiAx#3{`Y;Lw|5EH0#;^MMl!CR?-c4+r<#|;upOAm~(2JKr8a%FNQX^&>C z&j?5Eo4eu((9Xsb%7wo^xa-SpP}d2*2N~myn>Q6+DyO*CaIAmj)cIW7r{1A)zUy$) 
z0}7owaC7wU&v!O>;O7C^nx4VIqktprtZ1Bx$V0)BPFUSCEUokH_jZ}CW3aom;nIf! zJtv%%+Fg^+RbO%o=Xhrn2WN9=1RJ(=<;uvZGv+)yo?v!ez`3(lLOb25d>wb4fLrF$ ziHy1q=gpCI9jX0#8TxmG7XlERv$a)+=SgrGwhoqZRFSa$ne?l-XJ;`$1w(V7zqRzU zpOC&!rNcn`F+-q|adCJ7kSeb$U&FBl%eWo~fJqGf<84#vyj3dyN)_@&RiwB}mM(qT z;KowgaKQ60*psCoC*<$(^WjuTo@-XT_=~>jWu`mrnFXIYmHhfSY~lmAaBC=P17HZAFAyBuw1v+ zR_!r%d$XkEaR9fB;2IvW4$B|Ylus39j{W(W_dt36YO%7*vloMT+B?=?s=NxxUG}ju zk0bFWM-nXSj)T$k2Z{=vZft2~&&n7-biJdED?;h7>C-ULSlI+Is2t*H=$u zhQ3V$PB;SCo!Vv%6IX-Hc1o@{7(JFbG}r;RFnE_&X%*-=UcZp9`&n<(i)YMLPMhj7`wZB*LP^ZD;u`Xa?qh!*xcvAvL{8%~?bJ8Y zCUwbct*or_JgnoZEN0ht;l%P*D|OgWwhf<8l&-h;f_qy}h-GCC33gwdzxp@42($QB zhn@Y-OFtVMpMc^r$+|d$y$@Eo9fzdp)1AEtUJef{VHp|;CSUgGy8gLG!2(JiQ^=o%c7<|&_`&*+C`XtJTHk!WS znRJWs$ivshMK8w3SkX9#?CV+&0m**B0xwy;Iq;uT1)8n?Zx z#qZ;>A|6iKY{s(PWznD=DKjC886o}%@u)bCL@`&s&?eeZ?YX9q=;>-Tq+ey`vFYK0 z8AE#zP0fI{!L~>3zVsL0rcH8yO3dDoWm`14ckI|v9&_R-lAqFtyDw2FP5?^sQA@Tq z1Q-1bmt7_0@Uph94jj*xj{0=gt2eBynOrT1q)|9AK-Q1IhnbBI^{`hyfTKBxT{?_7 z{0F2mKRo?$kWNHE1Gsfu;IQs81`9$F(yZcnB!|q9Y-?#mYJbkB^Tt zUkA@D4`VJyVx)9s&QcYmz-|*`_P17wG-ntWGfZ(SVtxR#!{I*o#KkqgfB!D&(EZSD zeAw9eTT5MPVsl|IQIV~6_6_RD=4=~t=AZuhayJ(Im9Dur*pjU}X_`cl0Z88y(mya` z=1lYV*A_;dNPI!hm00hjfLJ^7J32PrzDuXTU*y-L!plgg6qfR1^@ zkVnGWsDn($HQKT{c9jv;2OJcQm7X(eRxu(N0JGTe(9l<_B!?aJLSRsciUNqldXY&V zDpqgSv1QoxB>?Z3Bl)Gv(r%%SVGi6uxDnNge!0pPhSArTdLu~r-~=|ax}y%>Ui0OF zu3IEhJf0-WEI>-4Y5{!0k$0P-o-s1i=v596rXF)#y(H#jPqle01)1ts`W|%B@mwu% zg4Jn`aW|F=zQBU!R#qO>b?LbR7U3cC&o+2UHN;XIJ#TXV!bgYTY&wbKq=ZK-z>O>p1&M!unE|j6_;~+K-mixM$`xT- zixn%^KH2fX>*60)S7spm=qkKV`k$xxcz=x1M<8hfuu|&YZYQ9oIkVaR4~e zH*B*78v#a&F*4G%#t=}6CtNT4DiL!z+1c3!NE^1Bm3^|Pj?ew>h)iDu`MW3*yKQ*i z@RL~=!I=xOZ@By3OH&yC+^1`Q>WO%rDF8O{6FChH4GsjaG4>|=W&IP-P4b1o@oT$()Et=zp@aji-;z8#W*XcoU*8bS%0Dy#5Ajt2+N%Z}= zDJVH_8d$qDL`zlFt7we@XcWPY=7YXod|KZ=dc&(D4v5hPDRv!ZuZ;@KkUlA5pYqR7 znWku95SDFiZ|)52X+5O1sISH<)1On?i5AEYw!a#w8|@BJOR$mVmlmVW-S7tiFm~kJ zf_G(Hol0X)>^Mg;q%8y>DH7$H^tC?=0xKy~*SjpE?^*Ry?~*Cw1IW3$V(*TTfIF3j@C!fI3(}OH0eXP}uB(4Y{LK 
z%+byiN<47cX<}?p3Gq!4DVk?sU|-xZw1iDX*Rc2_ZsreGkO9uxm#C3O;ZdN%rUn35kapn)78po%1VJUgZM%Uw?d>EYbzlaf?&B^Z&FwpS>RJtzTd1uL3oP zLlSUOPURbO_T0Ipphzem9vf$M9Zx#*C{DrYhmK>@QVPk(;*83%nW88^Pz`0(F1NHe zg=s!(r_MUB776c4CAjcI!2vBZ-sff=cC_gJNIo5IF50HGcIOs7lWXm+Rq`zs;dtoWMw zE4Il#dh}?(edD1ZhY}t-dvfnp?dN|=M{L?(5+kw>c95XF3kG;GZhc~3f4^@;#IMD% z9Qfedbe?DtZW5!wMa4V)F=NG}v~Pwqiz>d0H`B@M<_<{_;RZM^Mc&xqg(&lQ*Z9z0 zxQ(MArOQ+8t>$ydXrVlI6e*{u%g`A|XXj`Hy+hAe(H$;1{GR#=jGr2*FB-84@;(HN z5bRnz5*sSuXv@-_?X{Dw_ro<71C##z_3PJjA9_Aw}8{Bnq(C;>4&sh z$f#hl&>4|6oZ-k7IBx->lL#IlNggIdOi)AIwrl_T{pV*|>{{j5Z5P!O%+!gHrph_P z6ch|42tXw^mB04w-_M5%!*?$Fa2rpm2zOUn-|ann_o^dsC#R(FLp}rxR0X{290@|H z(*)9_Wg4^bX>Ot1&Uv3|lh28Q7$^vlJ^`sk8E}xOUAv}IsLa@lqLtfDp2wv+CJ-+4 z^ghrne$Xwz)m5f;WkgrRoKWzgLy9lr1#kOdgnCju@T$k zB=P$lzkh$;mWHdy3VXK~p5Kndn4S%mua2}Cuu1`op!%yYI9Qv^{X9#~Zpu%hEs+b8 zVWZTjgVO4xz606n+p4Ez*-0G#ca|McP{s3YoneRH?+m*sVcWc%w*urB%N)lS2aq&)*d>(NOQTpc13>Ib|~7?e(giK@pQONgUmfmQf_h(MZNf5tuPH zaDu1_1R(dHdnP~c?zKi1gx{cSo}-gf6zX4IDJ${NQoCdBhuGWAc91)_+SuA|wpY`2 z&x=c!Eel8H(i)pN?zAX+CO+n^$irl#ie<)VuL83K0ZI+fjyRol>(-5+G;sIS;>k5Q z&zQ}P5`-GCD}xD0>??EtNmeEQ{~`k)RB7!i=W*C!sy=3yqLwCp`ZG`YEsQg zH)Bw34=SQBkOufd4G7{e64-~C;6#tZC+nj%Y5eKEpEsibq5(%}J34fhSrTZK5s9zx ze&D6gg|XpMf6ZI*keFpiK-iZDS4OHH#HJ|SP@)gWP4NkvTY;>5yM3pY`g6@AKrAxp zsG1`BAO8ETjAaE-)cVu@2a^zdfX=N0}2x6D{_MLj5b-T zgic*X+V>ESZ8Nxd)PO1hE@&BX8rJ>2_#)h>*;QbBqrlK_wyIA<3g})^mVtH~WHivl&WHWc)P5;5F?x=fcp!ZwbAq)C+gSYz zC1^8vP{^i=8`T{=P%C~0)`6(v`+~YJ@O(t-%7d8kv82N(TjKnjt+zEIR6fP_FM}WN zud~<1NJ&YN=n<7F8J#QkDSB*X5za7zX<$uC*G8Ffpv1b*Uu&wRb%Pfcp6H<94rX5^4tHI&6#NeavMMM2wk3 z-eFX&MDa-j>yu#JByZbMt!|Yu91dRNY~?S{6+(LuItSd5!`v?;K>YSjL2bvJUJs)5 zoC`ytj%>7iR|v)p{A@YW_x4gMD-tIUo=d4;;(G9u*(fBMm^?!&6KIey<$QUeVw^amZhhs2Wgu7PyJR#GRG(E`x;Jg{^niv=_yWs zqRh&p?kx#?nHL}$jT-WgZ81=Jl0A2gtN~vESHWw#mR^xl$7i}Xhz?%AWysU87@HxJ z#4$;Q4zL9>UOT@5)KNFV$uUzVzrYk^zL>(a`5++hyEMYfHSBp9B-C7_>LlKgom1)= zvd}iz&k-nfd~9~oO?>JX&*k=kuw@98t)M@nkdMhL6Bm__(CgwL_l6GeL)A?a=iBvBC^5o#%Q>6Z9Q+dW{y+~v%mHQ!@xQFU3=&QBOx*0AKd}4Ha 
zZvTINmFI&QeI@J+ZV5l)y<{@?^?oO9%?2(&73^KdMCOB`rpXUkA`F0QvT4p=NqS+( zwiWv8xpG#ZbPlVN(YQmOQn`ANof{mzI)WPy6Eb3j*B2+ht%YxsCvKS_H~!7i=jVDG z<_%BJGAe37Ftr)lXO!TIWCGO!=@Z_b2pgmdr1~9{th|07MY-upKX`J3i@^>?F57yX zm|_${&kY?MqF*f#WKR8rz*U}TrHiajf=`2NocoGA77Emv8T=~q6+eBFPZMsrx9Ndy z6xnIK>0QJ@0Vo+ghGB30V|}F|5t>X;V`J=dkB*U-clg`#?3sCi=Hmpee0LNT@=Fc<~9!CYiD1Um>MnUw!@^n@CK1 z?bru#W?L-6(;2@I(6Nxe@c+lai2MipskChjhzx}Rg?|s+jPqq+Kd3xA_Nu_B6m_0S z?hEPB-(MHdga9*=iaj6=Hwg=?AmuNIdMz5_LGogM$JOcH035A=M8!}q#Gm_h2s*;n zdq;J?pkVRy%n`}+9_Nhy4f2}=X=9m{6X3vnel|6c%AJ57GeI8f%f_eZGs7FTmsZqI z%3(K4moNXdtqUSZ#i#=MyR=l0p;jqu@QFLHBhzw&Sl~Yx0a=wrGrXy2 z#^us|0qBK$`2)sE%}KwAU~l>zfzYfjMf5v4oww3F%u^0c_l|A{Xb96>ivM(e$rfhH z$y*3)RCxrPQ$vju{NK)et{s+I|IrH3#Xwd#2Q^9NY=J#w*6^WIKP8=Qoa~Zl|0?vi zv75_ENlU9k!~FMMrpn=P_M>vd_z*?k4Liv!nPCnE!SPhP7;rU@shk1pL!tQZV=>(u z=du+Ab>$oCbROlh{;?dQd>WHp^h#sJq-WgH_sLimm?8$m<|Z^7;La((#$@a|Ylolx z!8!h9NS`!;vEW`XH+UEdT?KHdTuTot{nOT zh?t?il|HSK^l=V%j$c?5-CUNx`|FdFb!|V+(XW!~^c!Y|L1HsFe;pLKZngi|zA2nL z&(A~W4b?D@(5{?c()ntd9K160Y2jhdD2&L?8Pu#ie`bu9^t?ybCQba!wxv&V3Ih(g zwJsLAZt6(@Exa>%FvvM`=bjs)yS{bulAaeU;sR_}oqR3=SiY-#L3ed(IAP^exvkp&!M7w|fI>#P7x_u!U?YJWr4QpSbYA&+L ztUFIn_%FRH({73ZXd|TfL5NNe{qv$EqJ>IRVq6}bt8O+QZ8oZ?IrXx%5B=W=Y~WA6NRfCXKRq1Bde}v>DzHC;4_b>u zFUZTh9bz6K2p`yJ`qo!-xj&iA)hsk)C3M}SE` zSJXs)0WDvw%!d!TZ}QXC&ZHWwmL(M$X_JUsKq32Vk>Sk$V5FLAJow=G8lF@E&72kK z=vSZ`6WS_c0hAjCC18`i{r&L|Ct<*#Py19kApGs90+2ic4SdA=Gl;s@(a|~dViNMC zGjJ_k_pt;b{GJ?`ZwS%wxD{a5zfqC&ia9I(X_O0zyTc=}`3U~kT21LY|I!h}&d8b> z1&-V&>ktmkkxa!}oj17kFO+8fxAe^Z%P;c3)T#Ubvb*}6W*dgqhGZSoP=|Eq3o_;) z?L~MGP_p-@7~p{Y27OfT+$BwIF=V55Dk`3)L%74QmGqK|Tqr!X zBFDG0Go>}@Tvs*28zrLOpwo{+4zeb^GWw`8>TYLM)amz}mjmPGI%qBz)KP;_1T;(g zHtBQC`LF|v$3@V&tAJ4$J3DFBTeh_4E*;^v+*jwmQcC@~rO5XkTk6*MoaZ<_eNfMK z_4N!+%hrMJp?B{}G*hIj<$CpPS2L5wPOyaZ^EXKE!ejhh19O19vKTzcVnqm#oz^F; zYwGRld5QyE`XV(mZu8sH+hP}4nl0^9yUU!9SIeO*VRVekE%ZL@73jX{_wRsj79W8p z7Y8l9zT@OuHmGMS$Hk$b1~$FW4Gftkwds%Le2h+TAk zRvz&2g2I_010sHa`(t*$D1JHxCt2M;H 
z-`iL0^zreQv&!)cYw7Lxu-=Xp9|#FLDABi2uxiEvCT0C-#*9E(9t9eVRBATZIq)sz z|K!s>3|7j5*a9UW&`KIA2+K*w1`&%QNhkho^N#(6i=oj4&6S<)!rrsMdqC(TzB_RT zHz#hqcXd{JvCNrW@pmEu3#5Jz9iuJn_V%f}HER;9=>BIt@ec450`14>9z;iqGBOk& zn>OxJqd1A=J>n-2H>gt;Xr3f@K!u+JL^pR#8{EHocUnt}$yzCg!(fUib-yR=Y=?xf zu*&fJ!iBPDl)(f|#s?HSpxllI_4CFo$7iiDJ?9Enx>|ofNsWf#3a(pLkL|7 ztr+%;mz`TB8L3&MeOG;Tx^p5a=TOny44v3WZ=Rrf#G$pu4?H6FL775AYa9h8svoKl ztWd9ah81^fg7D?-$*9N*81ktKem*%KS>^68Pw3BwUFY57bxp*ZNWCA+D{}2bO zgq#KWlREOIR>>*M#CN8U6*VDOoxPa11>9u`0>)G5W!g~dA<@*k>}_&?sW|p%=511ED>* zi)P+ZG;J(iyS8JSAV2@-Q4-If&z^eY4n-{}Tm}-{x;ftz<#|15Xe=9ISLo+M^q)Uh zFzqBb%>0S*!Cf0QW*g6)KVK9@HKOI4VblYyNP9#eCEgB+JoPuCg$H`MV0GRg#`f)H zKR@5gi?^sMp@t{DUBtAL4h&tw!$^IDHF-E^XutH9g);(?`v>mN6y1^4*r|(#^Xkl0 zVv>@?DXdd-^-@kw0WK~sNI~-;lFG;jM^Bvq<|r3Z@eb0mZ|`sDCE(zxN2xJ!c&p=# zy~0kg=IMKCX;}t4G-SA@()0$1Y0SaGSlinhhx&Y`3EO-WF}^~;~iXe&q84W;zS93WEG%YNo#3*-_t{O`4;hO zUBYL(zwHKp&xbROc?`4f-<)Wq-g;4}2BPyWFJ7r%1{o-k??qC5o&X$8XiI^q;p*HV zh#;q5PHp`l_3W)bysQQiTe4`;V@TdjaQp^Z6K8sYcZq~7kKPpPH=8T6IE;DM2ED|j zpM`|Fu!dBdw|3MQ;~k;1U1sb%Yh%jU#;cISL?VkZfmqeYU}DM!9jMW$B^{R0irP%! 
zro~>@*rul9;4Br?(L!ev6obR$UqP*zqhL62DX&J^k1XfTojcT&6eoZ63XTod$1WFo zVysCyT9BHWo5MkS@Ne0oh`yv7*RQ{F-%SnQAY6}Pql2|1o-bOvc57?Jg+|gjf@Mta zFHsJ)uML?0Xu}uOmH7iXF~Ys72m9Li zGvD`kj2*<33=$H-Mskhf&aQLfKZ`0J%U%&Tr}8%qTOihxT9B>g_B^tI?a?#=)F-~> zzt+10cS&3-wD1oYr1QWwKu+p|%iZI?{QmuW6Zkl2IPSxqSG?qwIQ!0aqr>ORSFZ*@ z^sXOQm5+Tv+o79Y4-8)sEhmaFlmgTx3efR@in@6|_VgAi(f;sUA_^Y+p$;`iVNnS@ zhtiodFONa@EYz&A^BB-NpKeAPA8IVgp!S7w{6G-+VKd{vOd8*xKNs?|EHnt-g}tDh z&&BE|F&u`GWK%yW^%cvw!mfEi;x~g#vOSP|<|1HC0FncGOf)UDmlg#{=u^3knphE| zPI(IPqOlTAYh$R(DH@$}n`>96Z-5@rC@TUBj9{;k*cVqPtno0&|67p!IihQ&y%bIT zr_fY$3I_b^vRRM#Ebd4xU?3DXgfSps$Nn2jgvh15e6q`~-7iu%GlP_ZsFxW)2?R3r z!xo*iuPgM|9C#}G*#>p_(s)zVKBP*1kirL|%gi4qV5R<=y>cLO@sc{sGCjv1K zE3Ia)oXaNksifec;wHXb@a=*Ok5r<~F^ToUJo3KVaIWKvQl&4W;`5H$zmw3wj1rIJ zt5Z_^Q|0*KKSb{WnfBG5D_W`VDdE@~M(xOvP~zj?1{a%9vqYRYrjJUA$u8AJ%a?zg zVl-#@gu+kAhNyp=JbA(9*tAm=K2R33^h{p52T7O}DrMy9{n3y{@4Bj1W?mmbEtD`$ z6L!73QJJ$8+&T;L)3HiwHy_a5VO$Vk=(+A;I4?}Q84@lLJpOj{L2rX#-3Y1ez;Mqx z22xI=x96uo(R+zYSmpXq7arV^d;4Y;uhPGl2Av=2|Gid36-svQm!s z`gxPhN8T2r0Mdqj+{{_=Q8l#Z$6h&z1oBa#eJ;RpsGl_FaFVM~Ve5lx-yh2O!X;ae zm*oeD<{@|$r6K)lM$JMrXGNUzpk?W6LxsC&7&-?i(5V(A@A;2T|HkHKBa%#`mU#*k zhdv^sQN;^wbHG6>cqU5$oivjiFD@&P;NQ+D%~HuA|AuI?60Qm`#RNHkB|3c!J(J6W zODiiY-QG%|)&N=k_wFZ}QtSe%F94{i@0xDpgeGF>)lI-)o}}17dhI#O1uV#HvE)Ey zyfB$?BqxVntYThq6OB|H9g`v@dS-TVbhP=!CfIHy!UkI5_mXC^qtSXp(`LADbU=kW z{rPPQjW7vxsrM8`z<)!p=tJ4eB(VabH}&Qj=KI}w;S{+KjSKxy{BA}OhI$66Ka7AY zdUYP-odvP7Xka@E-jllSM3K_LN9R`=X72&C9eAYH_~XYCXu=5H;FO>UqK>#lnktkg zBLCU3L)Tm@Fphgj)$k$cLkuOI`k&D#8ft6+_@L@JBN8r&YRaN$ zL*IDx7NfC!`gC)GruBcaR_KFaH>C+1Xih!lnfzz7>Zwc}E@`L9K_yYb-x*kY8tcF- zEL^hUT(mUxk<<7KUnDmB_U&7t?@3|NvDx1^|6A6jV9l*T>wvR^cG?$?1A%0RgA~gt zDk`GE13;YxGy;XbN}k-ZR-E-36H>P4ber3mEu4@Dm2xaWGBGLhhnzE;MlR6L2Kjzp z&?{R%eZ~xbR54fSTi*KQfYDZnsW;ybrijCNBZhmC%XKx_3~!DJVUC!iqXYm^fY$V& zZ^P{eRsMq8Ux0BT@MQWFzL6mnKwxcvY+E0=C?CLg@fI$weU@DTAGlmTF9m8xiQgQY zp`Pb(R~))jMWB8YCb2xtP{Aaf9XTt`p(&Pi&UATXHqMa})^O3hKaW=OSC~CL{xy 
zvuDlP1|TLv8-Qn|CJBm{8PxP2q`59+`++QZ9vImZ;8l{66HcSXEu1Qp-xQEf`hss? zwZrl`eAQ?0edyCH6j-@z7K+jsgSOQIlj*RHHlq#IO{qw=dQT-@o}q$6}lEi5b^ zYY%$&UcPcAa^LkunD%uI>e0zJwM|6GhzmdZCMkd}!QN$r(Y+W*6AKipOrd=wvE+6jlYqfc8~yBy6B zOcku%QmEbFK;9i|dN*~t_ma-Sn-?41lL!9%xrcT{a$gwfrQ&1L^m!>w%L4U#*-Pyt zJfEcNNc`BIFVwvRB$MLU7ml)-EoSOb3k3LqBWMBXM>F5=VEg(n&?xm2%bo0``5kk7 zXg&*d$T1Vy=h19H1eM8b5nXOTtb^{fwD68kG2Q?nWj~^b3aWqNNs2r_GLNE9wd~F5 z6GTA(Mji!QGNvPh6)grpqr?buQxl+nN*2z|_29u$0d0WoiXx9iO(+nvw=CFGS6f?# zHK1Gz?vyktnC!A{<591TIoInlzcG5o%n86k{bFEQo0QcZ8CmvEkjV#*FcLI;6?);W zdwOmMMnHT2wIU|abr>yQ0q9wicy&s1tDEjiSC$OKf>0zmf%5Z;|OlbxYCj#Mffh?Pdu zpfI-aniUsh{Gx>7pqJ%^XS4x7=HToc2v&^O;Sf6GT|XIxX-)T5v!H1<@RUK)&J|Tp z->%2Air|_yeqyN412l$ApE(@~x0t`M;cHzj&$4yE0qDl49v?y&;@!0A6O9U4r$xg; z2=f9SFq~d(S|{3cbK*OMj|$VX(#nP>|*Ct%h z&VQ2nIz8s*-x3N3nxZFquKDUmMw{&C?1y8KnlP<Qd?}M9-~k32YT&6;(i1(tA)_JT>NYuU*`*^`@8) zIxWPAJ%~*o2sq%yxfPv9%!xWL3nc3_>7pFND)!Axx~4l*g%{DFxuat%ummKI;;BnC z$q1>h)orA8qsNyU0B`NPH@JCZqK6gI91=IuJMF7L`> zbhlCK)s5S?E5JtZ)TLBxfG;eGS%Bt6Y3i{CgKc9J`Etb+#uXtBp(tx;ZZJSk3~5^H zY>mqiyGMhsU?>cYV2sg8KaSWzj>$BXvkSBi;y@6T1QBrFo88Ap4@Iu5xM8({rq}_f zQ-d%P2L*I%co!I*8{!d0ngFL7gEfo+hU14h7xJ$@1Yc{)WpfgwQUIoRg-x(;>w|0^ zNBK9-{eXy>TveykUzlw51tV6T!LPNHMXFA=Bw_3uYR1)jKjF0X14Z&-EJWd^m51KB z%0j=u&o2*uj$sB*;5~l+_^}?FLP}gxZ=leRT|0qes#+4WP9p+rA(pT8iI+?X_>Z@T z{rR*;XNi`?yjW;ynfTYiQP9w33Fb!${UahI*k@7=Pt>?@4a;>Ga**AtnLL(On-gz# zRXc()`qN%jDidTj+M)?n4b2%XZ+dsCCv*_JJ31=Yk*F2(43Xgm*%_ zQ;TsbA>Tq&+7(ndrpgH~Lw*I`(gePjk^xfw&@e-s<$+~S>yp$9x&(y zF%^aU;yt8hTrSYzc5< zq{0Kd^`(*{9-*X)CwouNcN?^Rk}?s@D*D)?!IPw8EEHvV(wBkcc4NmY@Z7B#{s9tA zCH7Kp06|nCpH_`IE~<*DgR1B=Cx(J>B~5JOGJNkhaezPBb(M+-f=hsLh+_`5I)b+V zAn-&$C3g!FrA8C;^=8kTw;kJVZxngvBI0Nrc_|#Ek1}hTam;Sv)95?20_BU&v-dPZ z4+Fmfp>qJ;Fv#Pna<}(`yT>RLC%oIX6+`3j-WTK4psT9Gn}0x`1G46-H>U#~p<-0T z2)KNxyMm4hZvpm3*3b+LA_N>Qg3Lh(x$q9)&?odjcx5qye>t_1cGS8EYR8juvY=yP zL7r?r^RTo57?DeP5lg)7)vH%V@N(fGN_fS^ztZR@nYtDFZmfqV+f=Lo(p{<<&*_tX4}@nEz6f(^Yn~& 
z?*yS44pBO}{DEO$17XlNqM(L_h)W@^8}EH}7|YCb$gAO*xQfj8?J=aLogTsAd#|ZGL`QhzeGbOgfg{Ck#o3I@;A}EDLz^d%N7-(8qcl zFBi{YJ>|@jfc$yr+9Nav)owg;d=JOw3Eg?e(Ic0fl+*&}!rr$Uv=3!bUsavn!sckEqs2TdpNm#*nl__UPWgV0O!N@Vn1JIklnsr2)tM2kjbP51yLNxPDGgIT<*E zrX*l+EXJw=4*O#6TY^K+DFiA-nzcrr1l5e%@gSTgevH)s7WaNHSS|N;Eww0M;q_FX$Mkl^^;*Q=BRRFB+DyQ zaVUpdy6N!S>*bA16w3esuh2+oSL9Vb3?#w`VG@@)n4}{o6%K_c8ae?w?KzHrN&0g3 zB#uw2TX6ERI~q8=WvJT`CV{N>tX~7zWzxPP``19?M(Q=n4Uyl2^S-k<+T$Ll&Bww& zyCRFSU0NG5+QAZ`m!O-WkgWAsqRXH<$z6CRo+7bXMkxO9URVNax-Mv9OWrf zf!LhtzM}d$^H?(;+uz8P>mD8MqtOwfXu~h?6Uuh0xV+1b+x8|$FvlvHDyuLVq#%{} z43#;kyoN!+RP90y27`O-*z+dfacLxW_V#L}&lu(Tbh`1riIFxz_*wbPv4KDoY)nIC zPFHV7Ca;UdxgR#&SdQ(wfTnT`5|lRBEX{ctd-xp;7Zvun+j{KvzSUwD#b9ZDL4DH5VBEbCnjg!t$Cw#xf&}0YwcFxA zVsOx#nN9+yH~rx|@#Z=1Y3#nHL6jmWEn?Wj#davnG4#uL47mYGIMUOLEmhv^%=QpE zK=pIrK5FPF1;UOzs_UwbIIzLtTgWVKrN`j2j+-W1W zYbWdkP3DHi9%k9o5e5)5@V%ChH_-|3PXm_}3gaUzdn%o^&>n$^q>epW zTRroPV9*ptc`7cWmqQim9@y_@8bU|yn2>x>jTUD`5e2%0&=SgqO3Z(HvY zCWRb;`jcFTvK_Ep(BFB|cA<<%d}#G+!<%DAJLo{*mC5SI_Natukwc&{2_PI(H#XzF81&?v8HO-G(`A4y|3Ih&cW`W= zZ7xrSxr&IsC#jzyKA6PE;1{$1L@a{H;{coE9OA=ZLu8hOw7Mi0qCyOuvNLN%jn8&; zvlwH*?K<_i>!Z5*#6d2T(-@ z4P2OcfbHySm^kR*!+6X9j9mtewrKQwJ*N!y1;KYlVZlc*%!u;BXoahb2^eHfjHe<& ze~b}S;x3&%V-_fn@xd87Ke+CIv3-Y8e453@rv@z*JH!Mq(ajK05@rcIvEe68eIk9K zj{BG`vjn1%$yVg!3G!5R4m&Y(0fZu|R2v^y)onvNkAo3`39ipT#jMc^KjURxZ~&u? 
z+jl$)bN!~bKsLyNO4Mk4xwdcvl2Se<<P&vnad?p8{8X^lQLm}pbt3QEr)7UA>8l$)5 z_o9ngTdWaN)lKn<-rip8ugcS>Pu~P3P)Ur~4~S!sza6!-bXgnr7e`+(oE1Bz;yC+1 zz6meAX0@8*+?zBU9(FHNE{DS|jp9|v)2OinS7fFvm7@Y76$xMp)YSqmlNx4dRzF!3 z)CeT*7wDDG`oXc~mJm&Wg6HESgat;I;#mm(4>>#*tY@O~rH0zHYtp`qSt@S;5=ixh zQtad$Bp@h{;U3rcrxVrhkbhCt0pQc4^|Du4qu#(=%q1RZn^zSYrsySNFc@sI!C^)1 zwc=dNM|%hc<{A7R8gI+cto{%XpoT+uIjj_uOHASF&>$jUETwXRZU}5iH-u}T&RR$S zbM)MBQqi#`L`qaTdYCLoU;uW3I@f_i&p9=&-M&p#L0?czNcsCRvbTgvJ8Mu9MkC@8 zLiY-$bzj_n@11{=@q*5#!gZ-=15^XSd$s*4&rnrfE_`kn09*?R&X8x5y#5(fM>MxS z?8Fmobky~3R^va5;GK>n2-7*9qciYM%Y~=l%Ey}TVnApK4TD;@_bRz=_?d<4M3+*9 z0r7+iD6FcRgmy7m4NaLT!~-L>ip24v)-UQc04E4TGTFe#H8}5GdaaDXAf@z}2}H9k zG0OiwoGtQdgSe{pOW)peRGs!bg2+0X;b1Y{JA&XhIc5)m#Xax@PjNo01h?$}Y=_yH zG>!s_tv6k@9BU=R01v2=f%MOt@ZoYDhb+!L$PmM5F|pt_c?vxLqquVq%Q;WOc*=3u z972W2XpGW0g-MbqB7ZTHq@AUfE5hd3P|h;m zFhf#8Vbaci?k@h@>)N^YpZ$;IeSe4V_j&H;e(vY~k|yL*7I4rohQuVzA4Jd^I(E!M z2Q$=*`+5PvbT4xo7^X(m8xJq-WB(!QEuCJ#4wUoROjHhxv}Evosy(8B=;}#vM8pYW z>KJ*X!XG6ExH%Rklp9<&WLzYCNUE@zO7123J|)21F_()_Gn1@D)s92#KyZjlI!y)I zMyX9DCOFgyd_Mf-&ER*L5*%{U%7Oxs=af9l#c?$a(NabVwqyE7GhrQ{HJP+^PTaxZ zlgTb?PRop)s50`9&m0*g|E|K+pxLgSa7;TX1S$PVEkn^atgg9L{*SuLANAvX{lU==zm$-6z_}#}{7S0f|minaIH*)Fano ztLmA5hJQ# z@~)kbn#-)&KK8I$u0Woe8c=7bQ4xK&p1 zFT-k$i`ZXZyGG_JKM)dD?Y%g2{o0ns`8qs;>J0ZqpKzPp8y_FIcB3%GBg_)d{>p8M z-i94*osW_$q@2L;;_7FSzaz!Je)s*U+fli%|CX}Pg^Aq5LfD*Dp zWjzC3Sy4iHUPsq9^Om-B9wXubM_RRpsPTQ67rj?g^BvlghX5_PBdG3GR)o`-MH;vH zHT3Dmgq1{IZ?>EI?LZxrq#U>~0y9%z>8KHjVr#Y0bJwH-uTd2>)>45w^T1eKA~Qv7 z_xkMB3?>@NFBUPwsY)K%cJ>3$XsMA34Sf;p6gv^AR5yT?A?*%S|kXe6)vBLb- z1)d`l-m^Z$^Tqz7+dVBE3)}qnd~$t4LB%B7I%nd)EJSHJ@G+ueb+5wZBe<5*LxsrmRa2OIxPn zutuy*v&-V1n0mgbmMV!Y6Y(p`nM+`&)&F`Gss022y%ax#lzGGc!VXz#~#-v`Hv`va5uGAjk z!KqnA-Z!0EKR=dG?zN(|ZJ}?)e+a#nap|2%e6y&TpKYGYh_+mj!Lm=fP#0 z_&ZoPYw?!Sd{cLU+VPzInj-%6x}r5P*hR#C*9{HartrBLAXC z-B~osh8UB<|6j^N?B?|GW-6DX1HZ~Tt`1T7t;h*{i*KrHq}`~i6sqPnrw87!@8J>F zXT%+^RuS2I^7uDt92X5@e^dd{Oah z2-gQc!gb*X=V4{axI$iB3EQPYEQ1~(W4|XN3{0fK>Z&sZ`S~kN61`9Sml>o74r-@n_mKah+g 
zYCXcfBPr>ESlO#s*=ZQV20D&c!(MSbrA1S6N| zQY+LVImy-#O+({)J7twGLflOXO?e4F@@2!&w~2e;!ClY^O5$LsDOsv_s?MZx=NdFM zH&357y3@hN)G>V~Y$>Dl=00-b(dt*rl5l}VB^RXPxp%=Qy~YfVF0Ljo-Ds6>k~*B( zj0jWmtUSOC1KUsZ=Ug3uWZ5#nvIrlEKbFTKDu;#Z>kNT$BY|X7XijJr$fnxbbL7NG zaEE;t4m;d-6(UOAWr(*R9tT0q2YmXdNxv?7n?}F7UTYw}v9JgtWtg977GA{5jg|Ni zq~M3R*Je}IErw(nblJnDJGZ;mNR@CO_@()FQd#p3uFIL!N;obbSSX%;FjM)_jrpRK zIKM$>9Pm7T7#L2P6P6>um$3=+mX!1)CzeHSG>h0rkoIw4JkIWWBK>{N#JHsNL!y$9 zT*C{PCG3YyT89QvguV_qB!SCPNPf6zEs_IVVlvKt@X6tCx$F>bMZ|sd%X^mE!hi;y z6_H5`OtPBu11uu?H9c%mqtJ)=a=inpPCwPqTa<3=c$0OwrTUz1%BL5VhZPX2N$`u@rMS0LaHs%Lp_l^C7SNOMVT2 z1C7IM&u;#BD1bJtCAZ_Xu(p*ct4}FL3ULra7Ay1lyq@lryeF?OcuS9)TP^BLfuh1& z&SUH%>)~Bu%-GDqCvdxX+#IbL+)$;a`zcsey(~#;e%pQ(3NqG_UarC})L7cM!ggSt~n^0dzD-|QvT2t#==Dv?OL0-E>QDUeO1ae`Ysoyn8vWR##fGO+S0Vh z^CN2apq#C)@-cB;%nHux7?5LfyG?)pynaSA_dQKNS>nzxQ}UmB7bIS9a?VSR;g9}< z``R%gXTfz_FXFU=bP_}eYBk)xjiZuU`It8Oju|@X4#6n=YK933;cvDve3)wseQ=A{ kCxagRzl=|x{mm?7^s7LpL0z_UMJ5x+P4(PAHgNsF0n%RxzyJUM diff --git a/joss_paper/figure_1.ipynb b/joss_paper/figure_1.ipynb deleted file mode 100644 index 3bb22ff..0000000 --- a/joss_paper/figure_1.ipynb +++ /dev/null @@ -1,78 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate iteration speed figure for JOSS paper. Iteration times come from https://becksteinlab.github.io/zarrtraj/#/" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "labels = ['SSD, XTC', 'SSD, H5MD', 'AWS S3, H5MD', 'SSD, ZarrMD', 'AWS S3, ZarrMD']\n", - "values = [1.49, 4.76, 10.30,3.10, 6.53] \n", - "colors = ['#009e73', '#e69f00', '#e69f00','#56b4e9', '#56b4e9']\n", - "\n", - "plt.figure(figsize=(8, 6))\n", - "plt.bar(labels, values, color=colors)\n", - "\n", - "\n", - "plt.title('Comparison of Trajectory Iteration Speed by Storage Medium')\n", - "plt.ylabel('Time (minutes)')\n", - "\n", - "\n", - "plt.xticks(rotation=45, ha='right')\n", - "plt.tight_layout()\n", - "\n", - "plt.savefig('benchmark.png')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "zarrtraj", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/joss_paper/paper.bib b/joss_paper/paper.bib deleted file mode 100644 index 56e687e..0000000 --- a/joss_paper/paper.bib +++ /dev/null @@ -1,178 +0,0 @@ -@article{FAIR:2019, - title = {Make scientific data FAIR}, - volume = {570}, - ISSN = {1476-4687}, - url = {http://dx.doi.org/10.1038/d41586-019-01720-7}, - DOI = {10.1038/d41586-019-01720-7}, - number = {7759}, - journal = {Nature}, - publisher = {Springer Science and Business Media LLC}, - author = {Stall, Shelley and Yarmey, Lynn and Cutcher-Gershenfeld, Joel and Hanson, Brooks and Lehnert, Kerstin and Nosek, Brian and Parsons, Mark and Robinson, Erin and Wyborn, Lesley}, - year = {2019}, - month = jun, - pages = {27–29} -} - -@misc{FoldingAtHome:2020, - title = {Foldingathome 
COVID-19 Datasets}, - url = {https://registry.opendata.aws/foldingathome-covid19}, - note = {Accessed: September 25, 2024} -} - -@article{H5MD:2014, - title = {H5MD: A structured, efficient, and portable file format for molecular data}, - journal = {Computer Physics Communications}, - volume = {185}, - number = {6}, - pages = {1546-1553}, - year = {2014}, - issn = {0010-4655}, - doi = {https://doi.org/10.1016/j.cpc.2014.01.018}, - url = {https://www.sciencedirect.com/science/article/pii/S0010465514000447}, - author = {Pierre {de Buyl} and Peter H. Colberg and Felix Höfling}, - keywords = {Molecular simulation, HDF5}, - abstract = {We propose a new file format named “H5MD” for storing molecular simulation data, such as trajectories of particle positions and velocities, along with thermodynamic observables that are monitored during the course of the simulation. H5MD files are HDF5 (Hierarchical Data Format) files with a specific hierarchy and naming scheme. Thus, H5MD inherits many benefits of HDF5, e.g., structured layout of multi-dimensional datasets, data compression, fast and parallel I/O, and portability across many programming languages and hardware platforms. H5MD files are self-contained, and foster the reproducibility of scientific data and the interchange of data between researchers using different simulation programs and analysis software. In addition, the H5MD specification can serve for other kinds of data (e.g. 
experimental data) and is extensible to supplemental data, or may be part of an enclosing file structure.} -} - -@inproceedings{H5MDReader:2021, - address = {Austin, TX}, - title = {{MPI}-parallel {Molecular} {Dynamics} {Trajectory} {Analysis} with the {H5MD} {Format} in the {MDAnalysis} {Python} {Package}}, - url = {https://conference.scipy.org/proceedings/scipy2021/edis_jakupovic.html}, - doi = {10.25080/majora-1b6fd038-005}, - abstract = {Molecular dynamics (MD) computer simulations help elucidate details of the molecular processes in complex biological systems, from protein dynamics to drug discovery. One major issue is that these MD simulation files are now commonly terabytes in size, which means analyzing the data from these files becomes a painstakingly expensive task. In the age of national supercomputers, methods of parallel analysis are becoming a necessity for the efficient use of time and high performance computing (HPC) resources but for any approach to parallel analysis, simply reading the file from disk becomes the performance bottleneck that limits overall analysis speed. One promising way around this file I/O hurdle is to use a parallel message passing interface (MPI) implementation with the HDF5 (Hierarchical Data Format 5) file format to access a single file simultaneously with numerous processes on a parallel file system. Our previous feasibility study suggested that this combination can lead to favorable parallel scaling with hundreds of CPU cores, so we implemented a fast and user-friendly HDF5 reader (the H5MDReader class) that adheres to H5MD (HDF5 for Molecular Dynamics) specifications. We made H5MDReader (together with a H5MD output class H5MDWriter) available in the MDAnalysis library, a Python package that simplifies the process of reading and writing various popular MD file formats by providing a streamlined user-interface that is independent of any specific file format. 
We benchmarked H5MDReader's parallel file reading capabilities on three HPC clusters: ASU Agave, SDSC Comet, and PSC Bridges. The benchmark consisted of a simple split-apply-combine scheme of an I/O bound task that split a 90k frame (113 GiB) coordinate trajectory into chunks for processes, where each process performed the commonly used RMSD (root mean square distance after optimal structural superposition) calculation on their chunk of data, and then gathered the results back to the root process. For baseline performance, we found maximum I/O speedups at 2 full nodes, with Agave showing 20x, and a maximum computation speedup on Comet of 373x on 384 cores (all three HPCs scaled well in their computation task). We went on to test a series of optimizations attempting to speed up I/O performance, including adjusting file system stripe count, implementing a masked array feature that only loads relevant data for the computation task, front loading all I/O by loading the entire trajectory into memory, and manually adjusting the HDF5 dataset chunk shapes. We found the largest improvement in I/O performance by optimizing the chunk shape of the HDF5 datasets to match the iterative access pattern of our analysis benchmark. With respect to baseline serial performance, our best result was a 98x speedup at 112 cores on ASU Agave. In terms of absolute time saved, the analysis went from 4623 seconds in the baseline serial run to 47 seconds in the parallel, properly chunked run. 
Our results emphasize the fact that file I/O is not just dependent on the access pattern of the file, but more so the synergy between access pattern and the layout of the file on disk.}, - urldate = {2021-07-05}, - booktitle = {Proceedings of the 20th {Python} in {Science} {Conference}}, - author = {Jakupovic, Edis and Beckstein, Oliver}, - editor = {Agarwal, Meghann and Calloway, Chris and Niederhut, Dillon and Shupe, David}, - year = {2021}, - pages = {40--48}, -} - -@INPROCEEDINGS{MDAKits:2023, - title = "{MDAKits}: A framework for {FAIR-compliant} molecular - simulation analysis", - booktitle = "Proceedings of the Python in Science Conference", - author = "Alibay, Irfan and Wang, Lily and Naughton, Fiona and Kenney, - Ian and Barnoud, Jonathan and Gowers, Richard and Beckstein, - Oliver", - publisher = "SciPy", - pages = "76--84", - year = 2023, - conference = "Python in Science Conference", - location = "Austin, Texas" -} - - -@InProceedings{MDAnalysis:2016, - author = { {R}ichard {J}. {G}owers and {M}ax {L}inke and {J}onathan {B}arnoud and {T}yler {J}. {E}. {R}eddy and {M}anuel {N}. {M}elo and {S}ean {L}. {S}eyler and {J}an {D}omański and {D}avid {L}. {D}otson and {S}ébastien {B}uchoux and {I}an {M}. {K}enney and {O}liver {B}eckstein }, - title = { {M}{D}{A}nalysis: {A} {P}ython {P}ackage for the {R}apid {A}nalysis of {M}olecular {D}ynamics {S}imulations }, - booktitle = { {P}roceedings of the 15th {P}ython in {S}cience {C}onference }, - pages = { 98 - 105 }, - year = { 2016 }, - editor = { {S}ebastian {B}enthall and {S}cott {R}ostrup }, - doi = { 10.25080/Majora-629e541a-00e } -} - - -@article{MDAnalysis:2011, - author = {Michaud-Agrawal, Naveen and Denning, Elizabeth J. and Woolf, Thomas B. 
and Beckstein, Oliver}, - title = {MDAnalysis: A toolkit for the analysis of molecular dynamics simulations}, - journal = {Journal of Computational Chemistry}, - volume = {32}, - number = {10}, - pages = {2319-2327}, - keywords = {molecular dynamics simulations, analysis, proteins, object-oriented design, software, membrane systems, Python programming language}, - doi = {https://doi.org/10.1002/jcc.21787}, - url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/jcc.21787}, - eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/jcc.21787}, - abstract = {Abstract MDAnalysis is an object-oriented library for structural and temporal analysis of molecular dynamics (MD) simulation trajectories and individual protein structures. It is written in the Python language with some performance-critical code in C. It uses the powerful NumPy package to expose trajectory data as fast and efficient NumPy arrays. It has been tested on systems of millions of particles. Many common file formats of simulation packages including CHARMM, Gromacs, Amber, and NAMD and the Protein Data Bank format can be read and written. Atoms can be selected with a syntax similar to CHARMM's powerful selection commands. MDAnalysis enables both novice and experienced programmers to rapidly write their own analytical tools and access data stored in trajectories in an easily accessible manner that facilitates interactive explorative analysis. MDAnalysis has been tested on and works for most Unix-based platforms such as Linux and Mac OS X. It is freely available under the GNU General Public License from http://mdanalysis.googlecode.com. © 2011 Wiley Periodicals, Inc. 
J Comput Chem 2011}, - year = {2011} -} - -@article {MDverse:2024, - article_type = {journal}, - title = {MDverse, shedding light on the dark matter of molecular dynamics simulations}, - author = {Tiemann, Johanna KS and Szczuka, Magdalena and Bouarroudj, Lisa and Oussaren, Mohamed and Garcia, Steven and Howard, Rebecca J and Delemotte, Lucie and Lindahl, Erik and Baaden, Marc and Lindorff-Larsen, Kresten and Chavent, Matthieu and Poulain, Pierre}, - editor = {Haider, Shozeb and Cui, Qiang}, - volume = 12, - year = 2024, - month = {aug}, - pub_date = {2024-08-30}, - pages = {RP90061}, - citation = {eLife 2024;12:RP90061}, - doi = {10.7554/eLife.90061}, - url = {https://doi.org/10.7554/eLife.90061}, - abstract = {The rise of open science and the absence of a global dedicated data repository for molecular dynamics (MD) simulations has led to the accumulation of MD files in generalist data repositories, constituting the \textit{dark matter of MD} — data that is technically accessible, but neither indexed, curated, or easily searchable. Leveraging an original search strategy, we found and indexed about 250,000 files and 2000 datasets from Zenodo, Figshare and Open Science Framework. With a focus on files produced by the Gromacs MD software, we illustrate the potential offered by the mining of publicly available MD data. We identified systems with specific molecular composition and were able to characterize essential parameters of MD simulation such as temperature and simulation length, and could identify model resolution, such as all-atom and coarse-grain. Based on this analysis, we inferred metadata to propose a search engine prototype to explore the MD data. 
To continue in this direction, we call on the community to pursue the effort of sharing MD data, and to report and standardize metadata to reuse this valuable matter.}, - keywords = {molecular dynamics, simulation, modeling, FAIR}, - journal = {eLife}, - issn = {2050-084X}, - publisher = {eLife Sciences Publications, Ltd}, -} - -@Article{NumPy:2020, - title = {Array programming with {NumPy}}, - author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. - van der Walt and Ralf Gommers and Pauli Virtanen and David - Cournapeau and Eric Wieser and Julian Taylor and Sebastian - Berg and Nathaniel J. Smith and Robert Kern and Matti Picus - and Stephan Hoyer and Marten H. van Kerkwijk and Matthew - Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del - R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre - G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and - Warren Weckesser and Hameer Abbasi and Christoph Gohlke and - Travis E. Oliphant}, - year = {2020}, - month = sep, - journal = {Nature}, - volume = {585}, - number = {7825}, - pages = {357--362}, - doi = {10.1038/s41586-020-2649-2}, - publisher = {Springer Science and Business Media {LLC}}, - url = {https://doi.org/10.1038/s41586-020-2649-2} -} - -@ARTICLE{PANGEO:2022, - AUTHOR={Stern, Charles and Abernathey, Ryan and Hamman, Joseph and Wegener, Rachel and Lepore, Chiara and Harkins, Sean and Merose, Alexander }, - - TITLE={Pangeo Forge: Crowdsourcing Analysis-Ready, Cloud Optimized Data Production}, - - JOURNAL={Frontiers in Climate}, - - VOLUME={3}, - - YEAR={2022}, - - URL={https://www.frontiersin.org/journals/climate/articles/10.3389/fclim.2021.782909}, - - DOI={10.3389/fclim.2021.782909}, - - ISSN={2624-9553}, - - ABSTRACT={

Pangeo Forge is a new community-driven platform that accelerates science by providing high-level recipe frameworks alongside cloud compute infrastructure for extracting data from provider archives, transforming it into analysis-ready, cloud-optimized (ARCO) data stores, and providing a human- and machine-readable catalog for browsing and loading. In abstracting the scientific domain logic of data recipes from cloud infrastructure concerns, Pangeo Forge aims to open a door for a broader community of scientists to participate in ARCO data production. A wholly open-source platform composed of multiple modular components, Pangeo Forge presents a foundation for the practice of reproducible, cloud-native, big-data ocean, weather, and climate science without relying on proprietary or cloud-vendor-specific tooling.

} -} - -@inproceedings{ParallelAnalysis:2010, - author = {Tu, Tiankai and Rendleman, Charles A. and Miller, Patrick J. and Sacerdoti, Federico and Dror, Ron O. and Shaw, David E.}, - title = {Accelerating parallel analysis of scientific simulation data via Zazen}, - year = {2010}, - publisher = {USENIX Association}, - address = {USA}, - abstract = {As a new generation of parallel supercomputers enables researchers to conduct scientific simulations of unprecedented scale and resolution, terabyte-scale simulation output has become increasingly commonplace. Analysis of such massive data sets is typically I/O-bound: many parallel analysis programs spend most of their execution time reading data from disk rather than performing useful computation. To overcome this I/O bottleneck, we have developed a new data access method. Our main idea is to cache a copy of simulation output files on the local disks of an analysis cluster's compute nodes, and to use a novel task-assignment protocol to co-locate data access with computation. We have implemented our methodology in a parallel disk cache system called Zazen. By avoiding the overhead associated with querying metadata servers and by reading data in parallel from local disks, Zazen is able to deliver a sustained read bandwidth of over 20 gigabytes per second on a commodity Linux cluster with 100 nodes, approaching the optimal aggregated I/O bandwidth attainable on these nodes. Compared with conventional NFS, PVFS2, and Hadoop/HDFS, respectively, Zazen is 75, 18, and 6 times faster for accessing large (1-GB) files, and 25, 13, and 85 times faster for accessing small (2-MB) files. 
We have deployed Zazen in conjunction with Anton--a special-purpose supercomputer that dramatically accelerates molecular dynamics (MD) simulations-- and have been able to accelerate the parallel analysis of terabyte-scale MD trajectories by about an order of magnitude.}, - booktitle = {Proceedings of the 8th USENIX Conference on File and Storage Technologies}, - pages = {10}, - numpages = {1}, - location = {San Jose, California}, - series = {FAST'10} -} - -@misc{Zarr:2024, - doi = {10.5281/ZENODO.3773449}, - url = {https://zenodo.org/doi/10.5281/zenodo.3773449}, - author = {Miles, Alistair and {jakirkham} and Bussonnier, M and Moore, Josh and Papadopoulos Orfanos, Dimitri and Bennett, Davis and Stansby, David and Hamman, Joe and Bourbeau, James and Fulton, Andrew and Lee, Gregory and Abernathey, Ryan and Rzepka, Norman and Patel, Zain and Kristensen, Mads R. B. and Verma, Sanket and Chopra, Saransh and Rocklin, Matthew and {AWA BRANDON AWA} and Jones, Max and Durant, Martin and Sales de Andrade, Elliott and Schut, Vincent and dussin, raphael and Chaudhary, Shivank and Barnes, Chris and Nunez-Iglesias, Juan and {shikharsg}}, - title = {zarr-developers/zarr-python: v3.0.0-alpha}, - publisher = {Zenodo}, - year = {2024}, - copyright = {Creative Commons Attribution 4.0 International} -} - diff --git a/joss_paper/paper.md b/joss_paper/paper.md deleted file mode 100644 index c7da231..0000000 --- a/joss_paper/paper.md +++ /dev/null @@ -1,120 +0,0 @@ ---- -title: 'Zarrtraj: A Python package for streaming molecular dynamics trajectories from cloud services' -tags: - - streaming - - molecular-dynamics - - file-format - - mdanalysis - - zarr -authors: - - name: Lawson Woods - orcid: 0009-0003-0713-4167 - affiliation: 1 - - name: Hugo Macdermott-Opeskin - orcid: 0000-0002-7393-7457 - affiliation: 1 - - name: Edis Jakupovic - affiliation: 1 - - name: Yuxuan Zhuang - orcid: 0000-0003-4390-8556 - affiliation: 1 - - name: Richard J Gowers - orcid: 0000-0002-3241-1846 - 
affiliation: 1 - - name: Oliver Beckstein - orcid: 0000-0003-1340-0831 - affiliation: 1 -affiliations: - - name: Placeholder - index: 1 -date: 22 September 2024 -bibliography: paper.bib ---- - -# Summary - -Molecular dynamics (MD) simulations provide a microscope into the behavior of -atomic-scale environments otherwise prohibitively difficult to observe; however, -the resulting trajectory data is too often siloed in a single institution's -HPC environment, rendering it unusable by the broader scientific community. -Zarrtraj enables these trajectories to be read directly from cloud storage providers -like AWS, Google Cloud, and Microsoft Azure into MDAnalysis, a popular Python -package for analyzing trajectory data, providing a method to open up access to -trajectory data to anyone with an internet connection. Enabling cloud streaming -for MD trajectories empowers easier replication of published analysis results, -analyses of large, conglomerate datasets from different sources, and training -machine learning models without downloading and storing trajectory data. - -# Statement of need - -The computing power in HPC environments has increased to the point where -running simulation algorithms is often no longer the constraint in -obtaining scientific insights from molecular dynamics trajectory data. -Instead, the ability to process, analyze and share large volumes of data provides -new constraints on research in this field. - -Other groups in the field recognize this same need for adherence to -FAIR principles [@FAIR:2019], including the MDDB (Molecular Dynamics Data Bank), an EU-scale -repository for biosimulation data [@MDDB:2024] and MDverse, a prototype search engine -for publicly-available Gromacs simulation data [@MDverse:2024]. -While these efforts currently offer prototype solutions for indexing and -searching MD trajectory data, the problem of efficiently distributing the data remains. 
- -Though exposing download links on the open internet offers a simple solution to this problem, -on-disk representations of molecular dynamics trajectories vary widely in size, -with large datasets reaching terabytes in scale [@ParallelAnalysis:2010; @FoldingAtHome:2020], -so a solution which could prevent this -duplication of storage and unnecessary download step would provide greater utility -for the computational molecular sciences ecosystem. - -Enter `Zarrtraj`, the first fully-functioning tool, to our knowledge, that allows -streaming trajectories into analysis software using an established trajectory format. -`Zarrtraj` is implemented as an `MDAnalysis` [@MDAnalysis:2016] `MDAKit` [@MDAKits:2023] that -enables streaming MD trajectories in the popular `HDF5`-based H5MD format [@H5MD:2014] -from AWS S3, Google Cloud Buckets, and Azure Blob Storage & Data Lakes without ever downloading them. -This is possible thanks to the `Zarr` [@Zarr:2024] package, which allows -streaming array-like data from a variety of storage mediums, and [Kerchunk](https://github.com/fsspec/kerchunk), -which extends the capability of `Zarr` by allowing it to read `HDF5` files. -Because it implements the standard `MDAnalysis` trajectory reader API, -`Zarrtraj` can leverage `Zarr`'s ability to read a file in parallel to perform analysis -algorithms in parallel using the "split-apply-combine" paradigm. In addition to the `H5MD` format, -`Zarrtraj` can stream and write trajectories in the experimental `ZarrMD` -format, which ports the `H5MD` layout to the `Zarr` filetype. 
- -Once imported, `Zarrtraj` allows passing trajectory URLs just like ordinary files: -```python -import zarrtraj -import MDAnalysis as mda - -u = mda.Universe("sample_topology.top", "s3://sample-bucket-name/trajectory.h5md") -``` -Initial benchmarks show that `Zarrtraj` can iterate -through an AWS S3 cloud trajectory (load into memory one frame at a time) -at roughly 1/2 to 1/3 the speed it can iterate through the same trajectory from disk and roughly -1/5 to 1/10 the speed it can iterate through the same trajectory on disk in XTC format (\autoref{fig:benchmark}). -However, it should be noted that this speed is influenced by network latency and that -writing parallelized algorithms can offset this loss of speed. - -![Benchmarks performed on a machine with 2 Intel Xeon 2.00GHz CPUs, 32GB of RAM, and an SSD configured with RAID 0.\label{fig:benchmark}](benchmark.png) - -With `Zarrtraj`, we envision research groups making their data publicly available -via a cloud URL so that anyone can reuse their trajectories and reproduce their results. -Large databases, like MDDB and MDverse, can expose a URL associated with each -trajectory in their databases so that users can make a query and immediately use the resulting -trajectories to run an analysis on the hits that match their search. Groups seeking to -collect a large volume of trajectory data to train machine learning models can make use -of our tool to efficiently and inexpensively obtain the data they need from these published -URLs. - -This work builds on the existing `MDAnalysis` `H5MDReader` -[@H5MDReader:2021], and similarly uses `NumPy` [@NumPy:2020] as a common interface between `MDAnalysis` -and the file storage medium. `Zarrtraj` was inspired by and made possible by similar efforts in the -geosciences community to align data practices with FAIR principles [@PANGEO:2022]. - - -# Acknowledgements -Thank you to Dr. Jenna Swarthout Goddard for supporting the GSoC program at MDAnalysis. 
-Thank you to Martin Durant, author of Kerchunk, for helping refine and merge features in his upstream codebase -necessary for this project. LW was a participant in the Google Summer of Code 2024 program. - -# References \ No newline at end of file