diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..5ac8d62
Binary files /dev/null and b/.DS_Store differ
diff --git a/.github/workflows/DocCleanup.yml b/.github/workflows/DocCleanup.yml
new file mode 100644
index 0000000..5be23b9
--- /dev/null
+++ b/.github/workflows/DocCleanup.yml
@@ -0,0 +1,33 @@
+name: Doc Preview Cleanup
+
+on:
+  pull_request:
+    types: [closed]
+
+# Ensure that only one "Doc Preview Cleanup" workflow is force pushing at a time
+concurrency:
+  group: doc-preview-cleanup
+  cancel-in-progress: false
+
+jobs:
+  doc-preview-cleanup:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout gh-pages branch
+        uses: actions/checkout@v4
+        with:
+          ref: gh-pages
+      - name: Delete preview and history + push changes
+        run: |
+          if [ -d "${preview_dir}" ]; then
+            git config user.name "Documenter.jl"
+            git config user.email "documenter@juliadocs.github.io"
+            git rm -rf "${preview_dir}"
+            git commit -m "delete preview"
+            git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree})
+            git push --force origin gh-pages-new:gh-pages
+          fi
+        env:
+          preview_dir: previews/PR${{ github.event.number }}
diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml
new file mode 100644
index 0000000..b99aad2
--- /dev/null
+++ b/.github/workflows/Documenter.yml
@@ -0,0 +1,22 @@
+name: Documenter
+on:
+  push:
+    branches:
+      - main
+    tags: '*'
+  pull_request:
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: '1.9.4'
+      - name: Install dependencies
+        run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
+      - name: Build and deploy
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+        run: julia --project=docs/ docs/make.jl
diff --git a/.gitignore b/.gitignore
index a65b14a..60f5510 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ examples/.ipynb_checkpoints
 *.svg
 *.png
 *.dot
+!logo.png
diff --git a/LICENSE b/LICENSE
index 5170dc3..1021cbd 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2022 SLIM Group @ Georgia Institute of Technology
+Copyright (c) 2024 SLIM Group @ Georgia Institute of Technology
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/Project.toml b/Project.toml
index 5e516c0..f30fa2e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,23 +1,7 @@
 name = "ParametricOperators"
 uuid = "db9e0614-c73c-4112-a40c-114e5b366d0d"
 authors = ["Richard Rex ", "Thomas Grady "]
-version = "1.1.2"
-
-[compat]
-julia = "1.9.4"
-CUDA = "5.2.0"
-ChainRulesCore = "1.23.0"
-Combinatorics = "1.0.2"
-DataStructures = "0.18.18"
-FFTW = "1.8.0"
-Flux = "0.14.15"
-GraphViz = "0.2.0"
-JSON = "0.21.4"
-LRUCache = "1.6.1"
-LaTeXStrings = "1.3.1"
-MPI = "0.20.19"
-Match = "2.0.0"
-OMEinsum = "0.8.1"
+version = "1.1.3"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Match = "7eb4fadd-790c-5f42-8a69-bfa0b872bfbf"
 OMEinsum = "ebe7aa44-baf0-506c-a96f-8464559b3922"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[compat]
+CUDA = "5.2.0"
+ChainRulesCore = "1.23.0"
+Combinatorics = "1.0.2"
+DataStructures = "0.18.18"
+FFTW = "1.8.0"
+Flux = "0.14.15"
+GraphViz = "0.2.0"
+JSON = "0.21.4"
+LRUCache = "1.6.1"
+LaTeXStrings = "1.3.1"
+MPI = "0.20.19"
"0.20.19" +Match = "2.0.0" +OMEinsum = "0.8.1" +julia = "1.9.4" diff --git a/README.md b/README.md index 6f902f7..ad12213 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,15 @@ # ParametricOperators.jl -[![][license-img]][license-status] +[![][license-img]][license-status] +[![Documenter](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/Documenter.yml/badge.svg)](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/Documenter.yml) +[![TagBot](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/TagBot.yml/badge.svg)](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/TagBot.yml) + -ParametricOperators.jl is a Julia Language-based scientific library designed to facilitate the creation and manipulation of tensor operations involving large-scale data using Kronecker products. It provides an efficient and mathematically consistent way to express tensor programs and distribution in the context of machine learning. +`ParametricOperators.jl` is a Julia Language-based scientific library designed to facilitate the creation and manipulation of tensor operations involving large-scale data using Kronecker products. It provides an efficient and mathematically consistent way to express tensor programs and distribution in the context of machine learning. + +> [!NOTE] +> [`ParametericDFNOs.jl`](https://github.com/slimgroup/ParametericDFNOs.jl/) is built on `ParametricOperators.jl` ## Features - Kronecker Product Operations: Implement tensor operations using Kronecker products for linear operators acting along multiple dimensions. @@ -16,200 +22,15 @@ ParametricOperators.jl is a Julia Language-based scientific library designed to ```julia julia> using Pkg - julia> Pkg.activate("path/to/your/project") + julia> Pkg.activate("path/to/your/environment") julia> Pkg.add("ParametricOperators") ``` This will add `ParametricOperators.jl` as dependency to your project -## Examples - -### 1. FFT of 3D Tensor - -```julia -using Pkg -Pkg.activate("./path/to/your/environment") - -using ParametricOperators - -T = Float32 - -gt, gx, gy = 100, 100, 100 - -# Define a transform along each dimension -Ft = ParDFT(T, gt) -Fx = ParDFT(Complex{T}, gx) -Fy = ParDFT(Complex{T}, gy) - -# Create a Kronecker operator than chains together the transforms -F = Fy ⊗ Fx ⊗ Ft - -# Apply the transform on a random input -x = rand(T, gt, gx, gy) |> gpu -y = F * vec(x) -``` - -### 2. Distributed FFT of a 3D Tensor: - -Make sure to add necessary dependencies. You might also need to load a proper MPI implementation based on your hardware. - -```julia -julia> using Pkg -julia> Pkg.activate("path/to/your/project") -julia> Pkg.add("MPI") -julia> Pkg.add("CUDA") -``` - -Copy the following code into a `.jl` file -```julia -using Pkg -Pkg.activate("./path/to/your/environment") - -using ParametricOperators -using CUDA -using MPI - -MPI.Init() - -comm = MPI.COMM_WORLD -rank = MPI.Comm_rank(comm) -size = MPI.Comm_size(comm) - -# Julia requires you to manually assign the gpus, modify to your case. 
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
-
-T = Float32
-
-# Define your Global Size and Data Partition
-gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
-
-# Define a transform along each dimension
-Ft = ParDFT(T, gt)
-Fx = ParDFT(Complex{T}, gx)
-Fy = ParDFT(Complex{T}, gy)
-
-# Create and distribute the Kronecker operator than chains together the transforms
-F = Fy ⊗ Fx ⊗ Ft
-F = distribute(F, partition)
-
-# Apply the transform on a random input
-x = rand(T, nt, nx, ny) |> gpu
-y = F * vec(x)
-
-MPI.Finalize()
-```
-
-You can run the above by doing:
-
-`srun -n N_TASKS julia code_above.jl`
-
-### 3. Parametrized Convolution on 3D Tensor
-
-Make sure to add necessary dependencies to compute the gradient
-
-```julia
-julia> using Pkg
-julia> Pkg.activate("path/to/your/project")
-julia> Pkg.add("Zygote")
-```
-
-```julia
-using Pkg
-Pkg.activate("./path/to/your/environment")
-
-using ParametricOperators
-using Zygote
-
-T = Float32
-
-gt, gx, gy = 100, 100, 100
-
-# Define a transform along each dimension
-St = ParMatrix(T, gt, gt)
-Sx = ParMatrix(T, gx, gx)
-Sy = ParMatrix(T, gy, gy)
-
-# Create a Kronecker operator than chains together the transforms
-S = Sy ⊗ Sx ⊗ St
-
-# Parametrize our transform
-θ = init(S) |> gpu
-
-# Apply the transform on a random input
-x = rand(T, gt, gx, gy) |> gpu
-y = S(θ) * vec(x)
-
-# Compute the gradient wrt some objective of our parameters
-θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
-```
-
-### 4. Distributed Parametrized Convolution of a 3D Tensor:
-
-Make sure to add necessary dependencies. You might also need to load a proper MPI implementation based on your hardware.
-
-```julia
-julia> using Pkg
-julia> Pkg.activate("path/to/your/project")
-julia> Pkg.add("MPI")
-julia> Pkg.add("CUDA")
-julia> Pkg.add("Zygote")
-```
-
-Copy the following code into a `.jl` file
-```julia
-using Pkg
-Pkg.activate("./path/to/your/environment")
-
-using ParametricOperators
-using CUDA
-using MPI
-
-MPI.Init()
-
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-size = MPI.Comm_size(comm)
-
-# Julia requires you to manually assign the gpus, modify to your case.
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
-
-T = Float32
-
-# Define your Global Size and Data Partition
-gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
-
-# Define a transform along each dimension
-St = ParMatrix(T, gt, gt)
-Sx = ParMatrix(T, gx, gx)
-Sy = ParMatrix(T, gy, gy)
-
-# Create and distribute the Kronecker operator than chains together the transforms
-S = Sy ⊗ Sx ⊗ St
-S = distribute(S, partition)
-
-# Parametrize our transform
-θ = init(S) |> gpu
-
-# Apply the transform on a random input
-x = rand(T, nt, nx, ny) |> gpu
-y = S(θ) * vec(x)
-
-# Compute the gradient wrt some objective of our parameters
-θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
-
-MPI.Finalize()
-```
-
-You can run the above by doing:
-
-`srun -n N_TASKS julia code_above.jl`
-
+Check out the [Documentation](https://slimgroup.github.io/ParametricOperators.jl) for more or get started by running some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)!
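+
+For a quick taste of the API, here is a minimal sketch (names and sizes are illustrative; see the documentation for end-to-end examples):
+
+```julia
+using ParametricOperators
+
+# A lazy 10x10 matrix operator, parametrized by weights θ
+A = ParMatrix(Float64, 10, 10)
+θ = init(A)
+
+# Apply the parametrized operator to a random input
+y = A(θ) * rand(10)
+```
+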
 ## Issues
diff --git a/docs/.DS_Store b/docs/.DS_Store
new file mode 100644
index 0000000..2d00b61
Binary files /dev/null and b/docs/.DS_Store differ
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..a303fff
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,2 @@
+build/
+site/
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 0000000..1b44072
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+ParametricOperators = "db9e0614-c73c-4112-a40c-114e5b366d0d"
diff --git a/docs/make.jl b/docs/make.jl
new file mode 100644
index 0000000..ddde293
--- /dev/null
+++ b/docs/make.jl
@@ -0,0 +1,31 @@
+using Documenter
+using ParametricOperators
+
+makedocs(
+    sitename = "ParametricOperators.jl",
+    format = Documenter.HTML(),
+    # modules = [ParametricOperators],
+    pages=[
+        "Introduction" => "index.md",
+        "Quick Start" => "quickstart.md",
+        "Distribution" => "distribution.md",
+        "Examples" => [
+            "3D FFT" => "examples/3D_FFT.md",
+            "Distributed 3D FFT" => "examples/3D_DFFT.md",
+            "3D Conv" => "examples/3D_Conv.md",
+            "Distributed 3D Conv" => "examples/3D_DConv.md",
+        ],
+        "API" => "api.md",
+        "Future Work" => "future.md",
+        "Citation" => "citation.md"
+    ]
+)
+
+# Automatically deploy documentation to gh-pages.
+deploydocs(
+    repo = "github.com/slimgroup/ParametricOperators.jl.git",
+    push_preview=true,
+    devurl = "dev",
+    devbranch = "main",
+    versions = ["dev" => "dev", "stable" => "v^"],
+)
diff --git a/docs/src/.DS_Store b/docs/src/.DS_Store
new file mode 100644
index 0000000..4c6c70d
Binary files /dev/null and b/docs/src/.DS_Store differ
diff --git a/docs/src/api.md b/docs/src/api.md
new file mode 100644
index 0000000..4b3d545
--- /dev/null
+++ b/docs/src/api.md
@@ -0,0 +1,3 @@
+### API usage for different operators
+
+Coming soon...
diff --git a/docs/src/assets/.DS_Store b/docs/src/assets/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/docs/src/assets/.DS_Store differ
diff --git a/docs/src/assets/logo.png b/docs/src/assets/logo.png
new file mode 100644
index 0000000..0b6138a
Binary files /dev/null and b/docs/src/assets/logo.png differ
diff --git a/docs/src/citation.md b/docs/src/citation.md
new file mode 100644
index 0000000..7773336
--- /dev/null
+++ b/docs/src/citation.md
@@ -0,0 +1,12 @@
+### Citation
+
+If you use `ParametricOperators.jl`, please cite the following:
+```
+@presentation{rex2023ML4SEISMIClsp,
+  title = {Large-scale parametric PDE approximations with model-parallel Fourier neural operators},
+  year = {2023},
+  month = {11},
+  url = {https://slim.gatech.edu/Publications/Public/Conferences/ML4SEISMIC/2023/rex2023ML4SEISMIClsp},
+  author = {Richard Rex and Thomas J. Grady II and Rishi Khan and Ziyi Yin and Felix J. Herrmann}
+}
+```
diff --git a/docs/src/distribution.md b/docs/src/distribution.md
new file mode 100644
index 0000000..e222621
--- /dev/null
+++ b/docs/src/distribution.md
@@ -0,0 +1,162 @@
+# Distribution as Linear Algebra
+
+We take the view that distributing a tensor computation is itself a linear algebra operation.
+
+This allows `ParametricOperators.jl` to offer several high-level APIs for performing controlled parallelism as part of your tensor program in the context of machine learning.
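+
+The identity behind this view is that applying separate transforms along each dimension of a tensor equals a single Kronecker matvec on the vectorized tensor: `vec(A * X * transpose(B)) == (B ⊗ A) * vec(X)`, mirroring the `Fz ⊗ Fy ⊗ Fx` ordering used throughout these docs. Below is a minimal sketch of this identity using plain Julia arrays, where the dense `kron` from `LinearAlgebra` stands in for the lazy, distributable operators of this package:
+
+```julia
+using LinearAlgebra
+
+A = rand(3, 3)  # transform along the first dimension
+B = rand(4, 4)  # transform along the second dimension
+X = rand(3, 4)  # a small 2D "tensor"
+
+# One Kronecker matvec equals applying A and B along their own dimensions
+kron(B, A) * vec(X) ≈ vec(A * X * transpose(B))  # returns true
+```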
+
+## Kronecker Distribution
+
+### Distributed Fourier Transform
+
+Let's consider the example of the Fourier Transform as seen in the [Fourier Transform Example](@ref)
+
+```julia
+# Define type and the size of our global tensor
+T = Float32
+gx, gy, gz = 10, 20, 30
+
+Fx = ParDFT(T, gx)
+Fy = ParDFT(Complex{T}, gy)
+Fz = ParDFT(Complex{T}, gz)
+
+F = Fz ⊗ Fy ⊗ Fx
+```
+
+Assume that our data is partitioned across multiple machines according to the following scheme:
+
+```julia
+partition = [1, 1, 2]
+```
+
+Each element of `partition` denotes the number of processing elements that divide our input tensor along that dimension.
+
+For example, given the above partition and global size, our local tensor would be of size:
+
+```julia
+x = rand(T, 10, 20, 15)
+```
+
+OR in other terms:
+
+```julia
+localx, localy, localz = [gx, gy, gz] .÷ partition
+x = rand(T, localx, localy, localz)
+```
+
+Now, following the method seen in several recent works (Grady et al., [2022](https://arxiv.org/pdf/2204.01205.pdf)) and [traditional distributed FFTs](https://jipolanco.github.io/PencilFFTs.jl/dev/tutorial/), we can distribute the application of our linearly separable transform across multiple processing elements by simply doing:
+
+```julia
+F = distribute(F, partition)
+```
+
+Now, to apply the Fourier Transform to our tensor, one can do:
+
+```julia
+F * vec(x)
+```
+
+Another out-of-the-box example can be seen at [Distributed FFT of a 3D Tensor](@ref)
+
+### Distributed Convolution
+
+!!! note "Definition of Convolution"
+    Convolution here refers to the application of a linear transform along the channel dimension
+
+Now, in order to extend this to a convolution layer, let's consider the following partitioned tensor:
+
+```julia
+T = Float32
+
+gx, gy, gc = 10, 30, 50
+partition = [2, 2, 1]
+
+nx, ny, nc = [gx, gy, gc] .÷ partition
+x = rand(T, nx, ny, nc)
+```
+
+Our tensor is sharded across the x and y dimensions by two processing elements along each dimension.
+
+We can define the operators of our convolution as:
+
+```julia
+Sx = ParIdentity(T, gx)
+Sy = ParIdentity(T, gy)
+Sc = ParMatrix(T, gc, gc)
+```
+
+Chain our operators and distribute them:
+
+```julia
+S = Sc ⊗ Sy ⊗ Sx
+S = distribute(S, partition)
+```
+
+Parametrize and apply our transform:
+
+```julia
+θ = init(S)
+S(θ) * vec(x)
+```
+
+Take the gradient of the parameters w.r.t. some objective by simply doing:
+
+```julia
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+```
+
+Another out-of-the-box example can be seen at [Distributed Parametrized Convolution of a 3D Tensor](@ref)
+
+## Sharing Weights
+
+Sharing weights can be thought of as a broadcasting operation.
+
+In order to share the weights of an operator across multiple processing elements, we can do:
+
+```julia
+A = ParMatrix(T, 20, 20)
+A = distribute(A)
+```
+
+Assume the following partition and tensor shape:
+
+```julia
+gc, gx = 20, 100
+partition = [1, 4]
+
+nc, nx = [gc, gx] .÷ partition
+x = rand(T, nc, nx)
+```
+
+Initialize and apply the matrix operator on the sharded tensor:
+
+```julia
+θ = init(A)
+A(θ) * x
+```
+
+Compute the gradient by doing:
+
+```julia
+θ′ = gradient(θ -> sum(A(θ) * x), θ)
+```
+
+## Reduction Operation
+
+In order to perform a reduction operation, more commonly known as an `ALL_REDUCE` operation, we can define:
+
+```julia
+R = ParReduce(T)
+```
+
+Given any local vector or matrix, we can do:
+
+```julia
+x = rand(T, 100)
+R * x
+```
+
+To compute the gradient of the input w.r.t. some objective:
+
+```julia
+x′ = gradient(x -> sum(R * x), x)
+```
diff --git a/docs/src/examples.md b/docs/src/examples.md
new file mode 100644
index 0000000..7353a2a
--- /dev/null
+++ b/docs/src/examples.md
@@ -0,0 +1,3 @@
+# ParametricOperators.jl
+
+Documentation for ParametricOperators.jl
diff --git a/docs/src/examples/3D_Conv.md b/docs/src/examples/3D_Conv.md
new file mode 100644
index 0000000..5bd52d1
--- /dev/null
+++ b/docs/src/examples/3D_Conv.md
@@ -0,0 +1,44 @@
+### Parametrized Convolution on 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+Make sure to add the necessary dependencies to compute the gradient:
+
+```julia
+julia> ]
+(v1.9) activate /path/to/your/environment
+(env) add Zygote ParametricOperators
+```
+
+```julia
+using ParametricOperators
+using Zygote
+
+T = Float32
+
+gt, gx, gy = 100, 100, 100
+
+# Define a transform along each dimension
+St = ParMatrix(T, gt, gt)
+Sx = ParMatrix(T, gx, gx)
+Sy = ParMatrix(T, gy, gy)
+
+# Create a Kronecker operator that chains together the transforms
+S = Sy ⊗ Sx ⊗ St
+
+# Parametrize our transform
+θ = init(S) |> gpu
+
+# Apply the transform on a random input
+x = rand(T, gt, gx, gy) |> gpu
+y = S(θ) * vec(x)
+
+# Compute the gradient of our parameters w.r.t. some objective
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+```
+
+Run the above by doing:
+```shell
+julia --project=/path/to/your/environment code_above.jl
+```
diff --git a/docs/src/examples/3D_DConv.md b/docs/src/examples/3D_DConv.md
new file mode 100644
index 0000000..162e394
--- /dev/null
+++ b/docs/src/examples/3D_DConv.md
@@ -0,0 +1,80 @@
+### Distributed Parametrized Convolution of a 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+Make sure to add the necessary dependencies. You might also need to load a proper MPI implementation based on your hardware.
+
+```julia
+julia> ]
+(v1.9) activate /path/to/your/environment
+(env) add MPI CUDA Zygote ParametricOperators
+```
+
+!!! warning "To run on multiple GPUs"
+    If you wish to run on multiple GPUs, make sure the GPUs are bound to different tasks. The approach we use is to unbind our GPUs on request and assign them manually:
+
+    ```julia
+    CUDA.device!(rank % 4)
+    ```
+
+    which might be different if you have more or fewer than 4 GPUs per node. Also, make sure your MPI distribution is functional.
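+
+Copy the following code into a `.jl` file: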
+```julia
+using ParametricOperators
+using CUDA
+using MPI
+using Zygote
+
+MPI.Init()
+
+comm = MPI.COMM_WORLD
+rank = MPI.Comm_rank(comm)
+size = MPI.Comm_size(comm)
+
+# Julia requires you to manually assign the GPUs; modify to your case.
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+St = ParMatrix(T, gt, gt)
+Sx = ParMatrix(T, gx, gx)
+Sy = ParMatrix(T, gy, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+S = Sy ⊗ Sx ⊗ St
+S = distribute(S, partition)
+
+# Parametrize our transform
+θ = init(S) |> gpu
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = S(θ) * vec(x)
+
+# Compute the gradient of our parameters w.r.t. some objective
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+
+MPI.Finalize()
+```
+
+If you have [`mpiexecjl`](https://juliaparallel.org/MPI.jl/stable/usage/#Installation) set up, you can run the above by doing:
+
+```shell
+mpiexecjl --project=/path/to/your/environment -n NTASKS julia code_above.jl
+```
+
+OR if you have a HPC cluster with [`slurm`](https://slurm.schedmd.com/documentation.html) set up, you can do:
+
+```shell
+salloc --gpus=NTASKS --time=01:00:00 --ntasks=NTASKS --gpus-per-task=1 --gpu-bind=none
+srun julia --project=/path/to/your/environment code_above.jl
+```
+
+!!! warning "Allocation"
+    Your `salloc` might look different based on your HPC cluster
diff --git a/docs/src/examples/3D_DFFT.md b/docs/src/examples/3D_DFFT.md
new file mode 100644
index 0000000..56aec80
--- /dev/null
+++ b/docs/src/examples/3D_DFFT.md
@@ -0,0 +1,73 @@
+### Distributed FFT of a 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+Make sure to add the necessary dependencies. You might also need to load a proper MPI implementation based on your hardware.
+
+```julia
+julia> ]
+(v1.9) activate /path/to/your/environment
+(env) add MPI CUDA ParametricOperators
+```
+
+!!! warning "To run on multiple GPUs"
+    If you wish to run on multiple GPUs, make sure the GPUs are bound to different tasks. The approach we use is to unbind our GPUs on request and assign them manually:
+
+    ```julia
+    CUDA.device!(rank % 4)
+    ```
+
+    which might be different if you have more or fewer than 4 GPUs per node. Also, make sure your MPI distribution is functional.
+
+Copy the following code into a `.jl` file:
+```julia
+using ParametricOperators
+using CUDA
+using MPI
+
+MPI.Init()
+
+comm = MPI.COMM_WORLD
+rank = MPI.Comm_rank(comm)
+size = MPI.Comm_size(comm)
+
+# Julia requires you to manually assign the GPUs; modify to your case.
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+Ft = ParDFT(T, gt)
+Fx = ParDFT(Complex{T}, gx)
+Fy = ParDFT(Complex{T}, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+F = Fy ⊗ Fx ⊗ Ft
+F = distribute(F, partition)
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = F * vec(x)
+
+MPI.Finalize()
+```
+
+If you have [`mpiexecjl`](https://juliaparallel.org/MPI.jl/stable/usage/#Installation) set up, you can run the above by doing:
+
+```shell
+mpiexecjl --project=/path/to/your/environment -n NTASKS julia code_above.jl
+```
+
+OR if you have a HPC cluster with [`slurm`](https://slurm.schedmd.com/documentation.html) set up, you can do:
+
+```shell
+salloc --gpus=NTASKS --time=01:00:00 --ntasks=NTASKS --gpus-per-task=1 --gpu-bind=none
+srun julia --project=/path/to/your/environment code_above.jl
+```
+
+!!! warning "Allocation"
+    Your `salloc` might look different based on your HPC cluster
diff --git a/docs/src/examples/3D_FFT.md b/docs/src/examples/3D_FFT.md
new file mode 100644
index 0000000..1618005
--- /dev/null
+++ b/docs/src/examples/3D_FFT.md
@@ -0,0 +1,29 @@
+### FFT of 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+```julia
+using ParametricOperators
+
+T = Float32
+
+gt, gx, gy = 100, 100, 100
+
+# Define a transform along each dimension
+Ft = ParDFT(T, gt)
+Fx = ParDFT(Complex{T}, gx)
+Fy = ParDFT(Complex{T}, gy)
+
+# Create a Kronecker operator that chains together the transforms
+F = Fy ⊗ Fx ⊗ Ft
+
+# Apply the transform on a random input
+x = rand(T, gt, gx, gy) |> gpu
+y = F * vec(x)
+```
+
+Run the above by doing:
+```shell
+julia --project=/path/to/your/environment code_above.jl
+```
diff --git a/docs/src/future.md b/docs/src/future.md
new file mode 100644
index 0000000..9ae84a9
--- /dev/null
+++ b/docs/src/future.md
@@ -0,0 +1,8 @@
+### Future Work
+
+Minimize computation and communication costs over all transformations of the abstract syntax tree such that
+- the transformations follow mathematical rules.
+- the transformations keep compute and memory bounded to the specification of the system.
+- the transformations give an optimal distribution pattern at a given point in the computation.
+
+Add tensor-tensor contraction to the abstraction.
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 0000000..32e48f8
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,44 @@
+### ParametricOperators.jl
+
+Modern machine learning and scientific computing are increasingly interested in the idea of tensor programs: programs whose fundamental object is the tensor, a multidimensional array of numbers.
+
+`ParametricOperators.jl` is an abstraction based on Kronecker products, designed for manipulating large-scale tensors. It utilizes lazy operators to facilitate distribution and gradient computation effectively.
+
+!!! note "Example usage of ParametricOperators.jl"
+    [`ParametricDFNOs.jl`](https://github.com/slimgroup/ParametericDFNOs.jl) is a library built on top of [`ParametricOperators.jl`](https://github.com/slimgroup/ParametricOperators.jl) that allows for large-scale machine learning using Fourier Neural Operators (FNOs)
+
+Read our paper [here](https://www.youtube.com/watch?v=xvFZjo5PgG0).
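+
+As a small, illustrative sketch of the abstraction (see the Quick Start for details), operators are built and composed lazily, before ever touching the data they act on:
+
+```julia
+using ParametricOperators
+
+T = Float32
+
+# Lazy DFTs along each dimension of a 10x20 tensor
+Fx = ParDFT(T, 10)
+Fy = ParDFT(Complex{T}, 20)
+
+# Compose into a single lazy 2D transform and apply it
+F = Fy ⊗ Fx
+y = F * vec(rand(T, 10, 20))
+```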
+
+### Authors
+
+This package is developed and maintained by Felix J. Herrmann's [SlimGroup](https://slim.gatech.edu/) at Georgia Institute of Technology. The main contributors to this package are:
+
+- [Richard Rex](https://www.linkedin.com/in/richard-rex/)
+- Thomas Grady
+- Mark Gliens
+
+### License
+
+```
+MIT License
+
+Copyright (c) 2024 SLIM Group @ Georgia Institute of Technology
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+```
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
new file mode 100644
index 0000000..7a1601d
--- /dev/null
+++ b/docs/src/quickstart.md
@@ -0,0 +1,150 @@
+## Installation
+
+Add `ParametricOperators.jl` as a dependency to your environment.
+
+To add, either do:
+
+```julia
+julia> ]
+(v1.9) add ParametricOperators
+```
+
+OR
+
+```julia
+julia> using Pkg
+julia> Pkg.activate("path/to/your/environment")
+julia> Pkg.add("ParametricOperators")
+```
+
+!!! note "Jump right in"
+    To get started, you can also try running some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+## Simple Operator
+
+Make sure to include the package in your environment:
+
+```julia
+using ParametricOperators
+```
+
+Let's start by defining a matrix operator of size `10x10`:
+
+```julia
+A = ParMatrix(10, 10)
+```
+
+Now, we parametrize our operator with some weights `θ`:
+
+```julia
+θ = init(A)
+```
+
+We can now apply our operator to some random input:
+
+```julia
+x = rand(10)
+A(θ) * x
+```
+
+## Gradient Computation
+
+!!! warning "Limited AD support"
warning "Limited AD support" + Current support only provided for Zygote.jl + +Make sure to include an AD package in your environment + +```julia +using Zygote +``` + +Using the example above, one can find the gradient of the weights or your input w.r.t to some objective using a standard AD package: + +```julia +# Gradient w.r.t weights +θ′ = gradient(θ -> sum(A(θ) * x), θ) + +# Gradient w.r.t input +x′ = gradient(x -> sum(A(θ) * x), x) +``` + +## Chaining Operators + +We can chain several operators together through multiple ways + +### Compose Operator + +Consider two matrices: + +```julia +L = ParMatrix(10, 4) +R = ParMatrix(10, 4) +``` + +We can now chain and parametrize them by: + +```julia +C = L * R' +θ = init(C) +``` + +This allows us to perform several operations such as + +```julia + +using Zygote +using LinearAlgebra + +x = rand(10) +C(θ) * x +gradient(θ -> norm(C(θ) * x), θ) +``` + +without ever constructing the full matrix `LR'`, a method more popularly referred to as [LR-decomposition](https://link.springer.com/chapter/10.1007/978-3-662-65458-3_11). + +### Kronecker Operator + +[Kronecker Product](https://en.wikipedia.org/wiki/Kronecker_product) is a most commonly used to represent the outer product on 2 matrices. + +We can use this to describe linearly separable transforms that act along different dimensions on a given input tensor. + +For example, consider the following tensor: + +```julia +T = Float32 +x = rand(T, 10, 20, 30) +``` + +We now define the transformation that would act along each dimension. In this case, a [Fourier Transform](https://en.wikipedia.org/wiki/Fourier_transform). + +#### Fourier Transform Example +```julia +Fx = ParDFT(T, 10) +Fy = ParDFT(Complex{T}, 20) +Fz = ParDFT(Complex{T}, 30) +``` + +We can now chain them together using a Kronecker Product: + +```julia +F = Fz ⊗ Fy ⊗ Fx +``` + +Now, we can compute this action on our input by simply doing: + +```julia +F * vec(x) +``` + +!!! tip "This can be extended to any parametrized operators" + For example, in order to apply a linear transform along the y, z dimension while performing no operation along x, one can do: + ```julia + Sx = ParIdentity(T, 10) + Sy = ParMatrix(T, 20, 20) + Sz = ParMatrix(T, 30, 30) + + S = Sz ⊗ Sy ⊗Sx + θ = init(S) + + S(θ) * vec(x) + ``` diff --git a/examples/3D_DConv.jl b/examples/3D_DConv.jl new file mode 100644 index 0000000..80d99f4 --- /dev/null +++ b/examples/3D_DConv.jl @@ -0,0 +1,41 @@ +using ParametricOperators +using CUDA +using MPI +using Zygote + +MPI.Init() + +comm = MPI.COMM_WORLD +rank = MPI.Comm_rank(comm) +size = MPI.Comm_size(comm) + +# Julia requires you to manually assign the gpus, modify to your case. 
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+St = ParMatrix(T, gt, gt)
+Sx = ParMatrix(T, gx, gx)
+Sy = ParMatrix(T, gy, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+S = Sy ⊗ Sx ⊗ St
+S = distribute(S, partition)
+
+# Parametrize our transform
+θ = init(S) |> gpu
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = S(θ) * vec(x)
+
+# Compute the gradient of our parameters w.r.t. some objective
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+
+MPI.Finalize()
diff --git a/examples/3D_DFFT.jl b/examples/3D_DFFT.jl
new file mode 100644
index 0000000..a9b9913
--- /dev/null
+++ b/examples/3D_DFFT.jl
@@ -0,0 +1,34 @@
+using ParametricOperators
+using CUDA
+using MPI
+
+MPI.Init()
+
+comm = MPI.COMM_WORLD
+rank = MPI.Comm_rank(comm)
+size = MPI.Comm_size(comm)
+
+# Julia requires you to manually assign the GPUs; modify to your case.
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+Ft = ParDFT(T, gt)
+Fx = ParDFT(Complex{T}, gx)
+Fy = ParDFT(Complex{T}, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+F = Fy ⊗ Fx ⊗ Ft
+F = distribute(F, partition)
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = F * vec(x)
+
+MPI.Finalize()
diff --git a/examples/3D_conv.jl b/examples/3D_conv.jl
index 5c1a7da..33df147 100644
--- a/examples/3D_conv.jl
+++ b/examples/3D_conv.jl
@@ -1,43 +1,24 @@
-using Pkg
-Pkg.activate("./")
-
 using ParametricOperators
 using Zygote
-using CUDA
-using MPI
-
-MPI.Init()
-
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-size = MPI.Comm_size(comm)
-
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
 
 T = Float32
 
-# Define your Global Size and Data Partition
 gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
 
 # Define a transform along each dimension
 St = ParMatrix(T, gt, gt)
 Sx = ParMatrix(T, gx, gx)
 Sy = ParMatrix(T, gy, gy)
 
-# Create and distribute the Kronecker operator than chains together the transforms
+# Create a Kronecker operator that chains together the transforms
 S = Sy ⊗ Sx ⊗ St
-S = distribute(S, partition)
 
 # Parametrize our transform
 θ = init(S) |> gpu
 
 # Apply the transform on a random input
-x = rand(T, nt, nx, ny) |> gpu
+x = rand(T, gt, gx, gy) |> gpu
 y = S(θ) * vec(x)
 
 # Compute the gradient wrt some objective of our parameters
 θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
-
-MPI.Finalize()
diff --git a/examples/3d_fft.jl b/examples/3d_fft.jl
index 4ff2e79..05db16e 100644
--- a/examples/3d_fft.jl
+++ b/examples/3d_fft.jl
@@ -1,33 +1,17 @@
-using Pkg
-Pkg.activate("./")
-
 using ParametricOperators
-using CUDA
-using MPI
-
-MPI.Init()
-
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-size = MPI.Comm_size(comm)
-
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
 
 T = Float32
 
-# Global and Local Partition
 gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
 
+# Define a transform along each dimension
 Ft = ParDFT(T, gt)
 Fx = ParDFT(Complex{T}, gx)
 Fy = ParDFT(Complex{T}, gy)
 
+# Create a Kronecker operator that chains together the transforms
 F = Fy ⊗ Fx ⊗ Ft
-F = distribute(F, partition)
 
-x = rand(T, nt, nx, ny) |> gpu
+# Apply the transform on a random input
+x = rand(T, gt, gx, gy) |> gpu
 y = F * vec(x)
-
-MPI.Finalize()
diff --git a/examples/fno.jl b/examples/fno.jl
deleted file mode 100644
index 0455ac8..0000000
--- a/examples/fno.jl
+++ /dev/null
@@ -1,54 +0,0 @@
-using CUDA
-using MAT
-using LinearAlgebra
-using NNlib
-using NNlibCUDA
-using ParametricOperators
-using Zygote
-
-function spectral_convolution(T, shape_spacetime, modes_spacetime, lifted_dim)
-
-    # Fourier transform along time/space
-    dft_time = ParDFT(T, shape_spacetime[end])
-    dft_space = ParKron([ParDFT(Complex{T}, s) for s in shape_spacetime[1:end-1]]...)
-    dft_spacetime = dft_time ⊗ dft_space
-
-    # Restriction to low-frequency modes
-    restrict = ParKron([ParRestriction(Complex{T}, Range(F), m) for (F, m) in zip(dft_spacetime.ops, reverse(modes_spacetime))]...)
-
-    # Combination dft/restriction along each channel dim
-    identity_channels = ParIdentity(T, lifted_dim)
-    dft_restrict = restrict*dft_spacetime
-    dft_all = dft_restrict ⊗ identity_channels
-
-    # Elementwise multiplication and channel mixing in frequency space
-    mix_channels = ParMatrix(Complex{T}, lifted_dim, lifted_dim)
-    elementwise_weighting = ParDiagonal(Complex{T}, Range(dft_restrict))
-    frequency_weight = elementwise_weighting ⊗ mix_channels
-
-    # Spectral conv
-    return dft_all'*frequency_weight*dft_all
-
-end
-
-function channel_mixing(T, shape_spacetime, lifted_dim)
-    mix_channels = ParMatrix(T, lifted_dim, lifted_dim)
-    identity_spacetime = ParIdentity(T, prod(shape_spacetime))
-    return identity_spacetime ⊗ mix_channels
-end
-
-function fno_block(x, spectral_conv, channel_mix)
-    y1 = spectral_conv(x)
-    y2 = channel_mix(x)
-    return gelu.(y1 .+ y2)
-end
-
-shape = [64, 64, 20]
-modes = [[1:8, 57:64], [1:8, 57:64], [1:8]]
-lifted_dim = 20
-
-T = Float64
-S = spectral_convolution(T, shape, modes, lifted_dim)
-W = channel_mixing(T, shape, lifted_dim)
-θ = init(S)
-init!(W, θ)
\ No newline at end of file
diff --git a/src/ParKron.jl b/src/ParKron.jl
index 003b579..b4196d7 100644
--- a/src/ParKron.jl
+++ b/src/ParKron.jl
@@ -282,7 +282,6 @@ function distribute(A::ParKron, comm_in::MPI.Comm, comm_out::MPI.Comm, parent_co
     coords_i = MPI.Cart_coords(comm_i)
 
     # Create repartition operator
-    !isequal(dims_prev, dims_i) && (MPI.Comm_rank(parent_comm) == 0) && println("Adding Repartition")
     !isequal(dims_prev, dims_i) && pushfirst!(ops, ParRepartition(DDT(Ai), comm_prev, comm_i, tuple(size_curr...)))
 
     # Create Kronecker w/ distributed identities