diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..5ac8d62
Binary files /dev/null and b/.DS_Store differ
diff --git a/.github/workflows/DocCleanup.yml b/.github/workflows/DocCleanup.yml
new file mode 100644
index 0000000..5be23b9
--- /dev/null
+++ b/.github/workflows/DocCleanup.yml
@@ -0,0 +1,33 @@
+name: Doc Preview Cleanup
+
+on:
+  pull_request:
+    types: [closed]
+
+# Ensure that only one "Doc Preview Cleanup" workflow is force pushing at a time
+concurrency:
+  group: doc-preview-cleanup
+  cancel-in-progress: false
+
+jobs:
+  doc-preview-cleanup:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout gh-pages branch
+        uses: actions/checkout@v4
+        with:
+          ref: gh-pages
+      - name: Delete preview and history + push changes
+        run: |
+          if [ -d "${preview_dir}" ]; then
+            git config user.name "Documenter.jl"
+            git config user.email "documenter@juliadocs.github.io"
+            git rm -rf "${preview_dir}"
+            git commit -m "delete preview"
+            git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree})
+            git push --force origin gh-pages-new:gh-pages
+          fi
+        env:
+          preview_dir: previews/PR${{ github.event.number }}
diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml
new file mode 100644
index 0000000..b99aad2
--- /dev/null
+++ b/.github/workflows/Documenter.yml
@@ -0,0 +1,22 @@
+name: Documenter
+on:
+  push:
+    branches:
+      - main
+    tags: '*'
+  pull_request:
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: '1.9.4'
+      - name: Install dependencies
+        run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
+      - name: Build and deploy
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+        run: julia --project=docs/ docs/make.jl
diff --git a/.gitignore b/.gitignore
index a65b14a..60f5510 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ examples/.ipynb_checkpoints
 *.svg
 *.png
 *.dot
+!logo.png
diff --git a/LICENSE b/LICENSE
index 5170dc3..1021cbd 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2022 SLIM Group @ Georgia Institute of Technology
+Copyright (c) 2024 SLIM Group @ Georgia Institute of Technology
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/Project.toml b/Project.toml
index 5e516c0..f30fa2e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,23 +1,7 @@
 name = "ParametricOperators"
 uuid = "db9e0614-c73c-4112-a40c-114e5b366d0d"
 authors = ["Richard Rex ", "Thomas Grady "]
-version = "1.1.2"
-
-[compat]
-julia = "1.9.4"
-CUDA = "5.2.0"
-ChainRulesCore = "1.23.0"
-Combinatorics = "1.0.2"
-DataStructures = "0.18.18"
-FFTW = "1.8.0"
-Flux = "0.14.15"
-GraphViz = "0.2.0"
-JSON = "0.21.4"
-LRUCache = "1.6.1"
-LaTeXStrings = "1.3.1"
-MPI = "0.20.19"
-Match = "2.0.0"
-OMEinsum = "0.8.1"
+version = "1.1.3"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Match = "7eb4fadd-790c-5f42-8a69-bfa0b872bfbf"
 OMEinsum = "ebe7aa44-baf0-506c-a96f-8464559b3922"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[compat]
+CUDA = "5.2.0"
+ChainRulesCore = "1.23.0"
+Combinatorics = "1.0.2"
+DataStructures = "0.18.18"
+FFTW = "1.8.0"
+Flux = "0.14.15"
+GraphViz = "0.2.0"
+JSON = "0.21.4"
+LRUCache = "1.6.1"
+LaTeXStrings = "1.3.1"
+MPI = "0.20.19"
"0.20.19" +Match = "2.0.0" +OMEinsum = "0.8.1" +julia = "1.9.4" diff --git a/README.md b/README.md index 6f902f7..ad12213 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,15 @@ # ParametricOperators.jl -[![][license-img]][license-status] +[![][license-img]][license-status] +[![Documenter](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/Documenter.yml/badge.svg)](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/Documenter.yml) +[![TagBot](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/TagBot.yml/badge.svg)](https://github.com/slimgroup/ParametricOperators.jl/actions/workflows/TagBot.yml) + -ParametricOperators.jl is a Julia Language-based scientific library designed to facilitate the creation and manipulation of tensor operations involving large-scale data using Kronecker products. It provides an efficient and mathematically consistent way to express tensor programs and distribution in the context of machine learning. +`ParametricOperators.jl` is a Julia Language-based scientific library designed to facilitate the creation and manipulation of tensor operations involving large-scale data using Kronecker products. It provides an efficient and mathematically consistent way to express tensor programs and distribution in the context of machine learning. + +> [!NOTE] +> [`ParametericDFNOs.jl`](https://github.com/slimgroup/ParametericDFNOs.jl/) is built on `ParametricOperators.jl` ## Features - Kronecker Product Operations: Implement tensor operations using Kronecker products for linear operators acting along multiple dimensions. @@ -16,200 +22,15 @@ ParametricOperators.jl is a Julia Language-based scientific library designed to ```julia julia> using Pkg - julia> Pkg.activate("path/to/your/project") + julia> Pkg.activate("path/to/your/environment") julia> Pkg.add("ParametricOperators") ``` This will add `ParametricOperators.jl` as dependency to your project -## Examples - -### 1. FFT of 3D Tensor - -```julia -using Pkg -Pkg.activate("./path/to/your/environment") - -using ParametricOperators - -T = Float32 - -gt, gx, gy = 100, 100, 100 - -# Define a transform along each dimension -Ft = ParDFT(T, gt) -Fx = ParDFT(Complex{T}, gx) -Fy = ParDFT(Complex{T}, gy) - -# Create a Kronecker operator than chains together the transforms -F = Fy ⊗ Fx ⊗ Ft - -# Apply the transform on a random input -x = rand(T, gt, gx, gy) |> gpu -y = F * vec(x) -``` - -### 2. Distributed FFT of a 3D Tensor: - -Make sure to add necessary dependencies. You might also need to load a proper MPI implementation based on your hardware. - -```julia -julia> using Pkg -julia> Pkg.activate("path/to/your/project") -julia> Pkg.add("MPI") -julia> Pkg.add("CUDA") -``` - -Copy the following code into a `.jl` file -```julia -using Pkg -Pkg.activate("./path/to/your/environment") - -using ParametricOperators -using CUDA -using MPI - -MPI.Init() - -comm = MPI.COMM_WORLD -rank = MPI.Comm_rank(comm) -size = MPI.Comm_size(comm) - -# Julia requires you to manually assign the gpus, modify to your case. 
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
-
-T = Float32
-
-# Define your Global Size and Data Partition
-gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
-
-# Define a transform along each dimension
-Ft = ParDFT(T, gt)
-Fx = ParDFT(Complex{T}, gx)
-Fy = ParDFT(Complex{T}, gy)
-
-# Create and distribute the Kronecker operator than chains together the transforms
-F = Fy ⊗ Fx ⊗ Ft
-F = distribute(F, partition)
-
-# Apply the transform on a random input
-x = rand(T, nt, nx, ny) |> gpu
-y = F * vec(x)
-
-MPI.Finalize()
-```
-
-You can run the above by doing:
-
-`srun -n N_TASKS julia code_above.jl`
-
-### 3. Parametrized Convolution on 3D Tensor
-
-Make sure to add necessary dependencies to compute the gradient
-
-```julia
-julia> using Pkg
-julia> Pkg.activate("path/to/your/project")
-julia> Pkg.add("Zygote")
-```
-
-```julia
-using Pkg
-Pkg.activate("./path/to/your/environment")
-
-using ParametricOperators
-using Zygote
-
-T = Float32
-
-gt, gx, gy = 100, 100, 100
-
-# Define a transform along each dimension
-St = ParMatrix(T, gt, gt)
-Sx = ParMatrix(T, gx, gx)
-Sy = ParMatrix(T, gy, gy)
-
-# Create a Kronecker operator than chains together the transforms
-S = Sy ⊗ Sx ⊗ St
-
-# Parametrize our transform
-θ = init(S) |> gpu
-
-# Apply the transform on a random input
-x = rand(T, gt, gx, gy) |> gpu
-y = S(θ) * vec(x)
-
-# Compute the gradient wrt some objective of our parameters
-θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
-```
-
-### 4. Distributed Parametrized Convolution of a 3D Tensor:
-
-Make sure to add necessary dependencies. You might also need to load a proper MPI implementation based on your hardware.
-
-```julia
-julia> using Pkg
-julia> Pkg.activate("path/to/your/project")
-julia> Pkg.add("MPI")
-julia> Pkg.add("CUDA")
-julia> Pkg.add("Zygote")
-```
-
-Copy the following code into a `.jl` file
-```julia
-using Pkg
-Pkg.activate("./path/to/your/environment")
-
-using ParametricOperators
-using CUDA
-using MPI
-
-MPI.Init()
-
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-size = MPI.Comm_size(comm)
-
-# Julia requires you to manually assign the gpus, modify to your case.
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
-
-T = Float32
-
-# Define your Global Size and Data Partition
-gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
-
-# Define a transform along each dimension
-St = ParMatrix(T, gt, gt)
-Sx = ParMatrix(T, gx, gx)
-Sy = ParMatrix(T, gy, gy)
-
-# Create and distribute the Kronecker operator than chains together the transforms
-S = Sy ⊗ Sx ⊗ St
-S = distribute(S, partition)
-
-# Parametrize our transform
-θ = init(S) |> gpu
-
-# Apply the transform on a random input
-x = rand(T, nt, nx, ny) |> gpu
-y = S(θ) * vec(x)
-
-# Compute the gradient wrt some objective of our parameters
-θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
-
-MPI.Finalize()
-```
-
-You can run the above by doing:
-
-`srun -n N_TASKS julia code_above.jl`
-
+Check out the [Documentation](https://slimgroup.github.io/ParametricOperators.jl) for more or get started by running some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)!
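+
+For a quick taste of the API, here is a minimal sketch (names and sizes are illustrative; see the documentation for end-to-end examples):
+
+```julia
+using ParametricOperators
+
+# A lazy 10x10 matrix operator, parametrized by weights θ
+A = ParMatrix(Float64, 10, 10)
+θ = init(A)
+
+# Apply the parametrized operator to a random input
+y = A(θ) * rand(10)
+```
+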
 ## Issues
diff --git a/docs/.DS_Store b/docs/.DS_Store
new file mode 100644
index 0000000..2d00b61
Binary files /dev/null and b/docs/.DS_Store differ
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..a303fff
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,2 @@
+build/
+site/
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 0000000..1b44072
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+ParametricOperators = "db9e0614-c73c-4112-a40c-114e5b366d0d"
diff --git a/docs/make.jl b/docs/make.jl
new file mode 100644
index 0000000..ddde293
--- /dev/null
+++ b/docs/make.jl
@@ -0,0 +1,31 @@
+using Documenter
+using ParametricOperators
+
+makedocs(
+    sitename = "ParametricOperators.jl",
+    format = Documenter.HTML(),
+    # modules = [ParametricOperators],
+    pages=[
+        "Introduction" => "index.md",
+        "Quick Start" => "quickstart.md",
+        "Distribution" => "distribution.md",
+        "Examples" => [
+            "3D FFT" => "examples/3D_FFT.md",
+            "Distributed 3D FFT" => "examples/3D_DFFT.md",
+            "3D Conv" => "examples/3D_Conv.md",
+            "Distributed 3D Conv" => "examples/3D_DConv.md",
+        ],
+        "API" => "api.md",
+        "Future Work" => "future.md",
+        "Citation" => "citation.md"
+    ]
+)
+
+# Automatically deploy documentation to gh-pages.
+deploydocs(
+    repo = "github.com/slimgroup/ParametricOperators.jl.git",
+    push_preview=true,
+    devurl = "dev",
+    devbranch = "main",
+    versions = ["dev" => "dev", "stable" => "v^"],
+)
diff --git a/docs/src/.DS_Store b/docs/src/.DS_Store
new file mode 100644
index 0000000..4c6c70d
Binary files /dev/null and b/docs/src/.DS_Store differ
diff --git a/docs/src/api.md b/docs/src/api.md
new file mode 100644
index 0000000..4b3d545
--- /dev/null
+++ b/docs/src/api.md
@@ -0,0 +1,3 @@
+### API usage for different operators
+
+Coming soon...
diff --git a/docs/src/assets/.DS_Store b/docs/src/assets/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/docs/src/assets/.DS_Store differ
diff --git a/docs/src/assets/logo.png b/docs/src/assets/logo.png
new file mode 100644
index 0000000..0b6138a
Binary files /dev/null and b/docs/src/assets/logo.png differ
diff --git a/docs/src/citation.md b/docs/src/citation.md
new file mode 100644
index 0000000..7773336
--- /dev/null
+++ b/docs/src/citation.md
@@ -0,0 +1,12 @@
+### Citation
+
+If you use `ParametricOperators.jl`, please cite the following:
+```
+@presentation{rex2023ML4SEISMIClsp,
+  title = {Large-scale parametric PDE approximations with model-parallel Fourier neural operators},
+  year = {2023},
+  month = {11},
+  url = {https://slim.gatech.edu/Publications/Public/Conferences/ML4SEISMIC/2023/rex2023ML4SEISMIClsp},
+  author = {Richard Rex and Thomas J. Grady II and Rishi Khan and Ziyi Yin and Felix J. Herrmann}
+}
+```
diff --git a/docs/src/distribution.md b/docs/src/distribution.md
new file mode 100644
index 0000000..e222621
--- /dev/null
+++ b/docs/src/distribution.md
@@ -0,0 +1,162 @@
+# Distribution as Linear Algebra
+
+We take the view that distributing a tensor computation is itself a linear algebra operation.
+
+This allows `ParametricOperators.jl` to offer several high-level APIs for performing controlled parallelism as part of your tensor program in the context of machine learning.
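+
+The identity behind this view is that applying separate transforms along each dimension of a tensor equals a single Kronecker matvec on the vectorized tensor: `vec(A * X * transpose(B)) == (B ⊗ A) * vec(X)`, mirroring the `Fz ⊗ Fy ⊗ Fx` ordering used throughout these docs. Below is a minimal sketch of this identity using plain Julia arrays, where the dense `kron` from `LinearAlgebra` stands in for the lazy, distributable operators of this package:
+
+```julia
+using LinearAlgebra
+
+A = rand(3, 3)  # transform along the first dimension
+B = rand(4, 4)  # transform along the second dimension
+X = rand(3, 4)  # a small 2D "tensor"
+
+# One Kronecker matvec equals applying A and B along their own dimensions
+kron(B, A) * vec(X) ≈ vec(A * X * transpose(B))  # returns true
+```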
+
+## Kronecker Distribution
+
+### Distributed Fourier Transform
+
+Let's consider the example of the Fourier Transform as seen in the [Fourier Transform Example](@ref)
+
+```julia
+# Define type and the size of our global tensor
+T = Float32
+gx, gy, gz = 10, 20, 30
+
+Fx = ParDFT(T, gx)
+Fy = ParDFT(Complex{T}, gy)
+Fz = ParDFT(Complex{T}, gz)
+
+F = Fz ⊗ Fy ⊗ Fx
+```
+
+Assume that our data is partitioned across multiple machines according to the following scheme:
+
+```julia
+partition = [1, 1, 2]
+```
+
+Each element of `partition` denotes the number of processing elements that divide our input tensor along that dimension.
+
+For example, given the above partition and global size, our local tensor would be of size:
+
+```julia
+x = rand(T, 10, 20, 15)
+```
+
+OR in other terms:
+
+```julia
+localx, localy, localz = [gx, gy, gz] .÷ partition
+x = rand(T, localx, localy, localz)
+```
+
+Now, following the method seen in several recent works (Grady et al., [2022](https://arxiv.org/pdf/2204.01205.pdf)) and [traditional distributed FFTs](https://jipolanco.github.io/PencilFFTs.jl/dev/tutorial/), we can distribute the application of our linearly separable transform across multiple processing elements by simply doing:
+
+```julia
+F = distribute(F, partition)
+```
+
+Now, to apply the Fourier Transform to our tensor, one can do:
+
+```julia
+F * vec(x)
+```
+
+Another out-of-the-box example can be seen at [Distributed FFT of a 3D Tensor](@ref)
+
+### Distributed Convolution
+
+!!! note "Definition of Convolution"
+    Convolution here refers to the application of a linear transform along the channel dimension
+
+Now, in order to extend this to a convolution layer, let's consider the following partitioned tensor:
+
+```julia
+T = Float32
+
+gx, gy, gc = 10, 30, 50
+partition = [2, 2, 1]
+
+nx, ny, nc = [gx, gy, gc] .÷ partition
+x = rand(T, nx, ny, nc)
+```
+
+Our tensor is sharded across the x and y dimensions by two processing elements along each dimension.
+
+We can define the operators of our convolution as:
+
+```julia
+Sx = ParIdentity(T, gx)
+Sy = ParIdentity(T, gy)
+Sc = ParMatrix(T, gc, gc)
+```
+
+Chain our operators and distribute them:
+
+```julia
+S = Sc ⊗ Sy ⊗ Sx
+S = distribute(S, partition)
+```
+
+Parametrize and apply our transform:
+
+```julia
+θ = init(S)
+S(θ) * vec(x)
+```
+
+Take the gradient of the parameters w.r.t. some objective by simply doing:
+
+```julia
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+```
+
+Another out-of-the-box example can be seen at [Distributed Parametrized Convolution of a 3D Tensor](@ref)
+
+## Sharing Weights
+
+Sharing weights can be thought of as a broadcasting operation.
+
+In order to share the weights of an operator across multiple processing elements, we can do:
+
+```julia
+A = ParMatrix(T, 20, 20)
+A = distribute(A)
+```
+
+Assume the following partition and tensor shape:
+
+```julia
+gc, gx = 20, 100
+partition = [1, 4]
+
+nc, nx = [gc, gx] .÷ partition
+x = rand(T, nc, nx)
+```
+
+Initialize and apply the matrix operator on the sharded tensor:
+
+```julia
+θ = init(A)
+A(θ) * x
+```
+
+Compute the gradient by doing:
+
+```julia
+θ′ = gradient(θ -> sum(A(θ) * x), θ)
+```
+
+## Reduction Operation
+
+In order to perform a reduction operation, more commonly known as an `ALL_REDUCE` operation, we can define:
+
+```julia
+R = ParReduce(T)
+```
+
+Given any local vector or matrix, we can do:
+
+```julia
+x = rand(T, 100)
+R * x
+```
+
+To compute the gradient of the input w.r.t. some objective:
+
+```julia
+x′ = gradient(x -> sum(R * x), x)
+```
diff --git a/docs/src/examples.md b/docs/src/examples.md
new file mode 100644
index 0000000..7353a2a
--- /dev/null
+++ b/docs/src/examples.md
@@ -0,0 +1,3 @@
+# ParametricOperators.jl
+
+Documentation for ParametricOperators.jl
diff --git a/docs/src/examples/3D_Conv.md b/docs/src/examples/3D_Conv.md
new file mode 100644
index 0000000..5bd52d1
--- /dev/null
+++ b/docs/src/examples/3D_Conv.md
@@ -0,0 +1,44 @@
+### Parametrized Convolution on 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+Make sure to add the necessary dependencies to compute the gradient:
+
+```julia
+julia> ]
+(v1.9) activate /path/to/your/environment
+(env) add Zygote ParametricOperators
+```
+
+```julia
+using ParametricOperators
+using Zygote
+
+T = Float32
+
+gt, gx, gy = 100, 100, 100
+
+# Define a transform along each dimension
+St = ParMatrix(T, gt, gt)
+Sx = ParMatrix(T, gx, gx)
+Sy = ParMatrix(T, gy, gy)
+
+# Create a Kronecker operator that chains together the transforms
+S = Sy ⊗ Sx ⊗ St
+
+# Parametrize our transform
+θ = init(S) |> gpu
+
+# Apply the transform on a random input
+x = rand(T, gt, gx, gy) |> gpu
+y = S(θ) * vec(x)
+
+# Compute the gradient of our parameters w.r.t. some objective
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+```
+
+Run the above by doing:
+```shell
+julia --project=/path/to/your/environment code_above.jl
+```
diff --git a/docs/src/examples/3D_DConv.md b/docs/src/examples/3D_DConv.md
new file mode 100644
index 0000000..162e394
--- /dev/null
+++ b/docs/src/examples/3D_DConv.md
@@ -0,0 +1,80 @@
+### Distributed Parametrized Convolution of a 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+Make sure to add the necessary dependencies. You might also need to load a proper MPI implementation based on your hardware.
+
+```julia
+julia> ]
+(v1.9) activate /path/to/your/environment
+(env) add MPI CUDA Zygote ParametricOperators
+```
+
+!!! warning "To run on multiple GPUs"
+    If you wish to run on multiple GPUs, make sure the GPUs are bound to different tasks. The approach we use is to unbind our GPUs on request and assign them manually:
+
+    ```julia
+    CUDA.device!(rank % 4)
+    ```
+
+    which might be different if you have more or fewer than 4 GPUs per node. Also, make sure your MPI distribution is functional.
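+
+Copy the following code into a `.jl` file: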
+```julia
+using ParametricOperators
+using CUDA
+using MPI
+using Zygote
+
+MPI.Init()
+
+comm = MPI.COMM_WORLD
+rank = MPI.Comm_rank(comm)
+size = MPI.Comm_size(comm)
+
+# Julia requires you to manually assign the GPUs; modify to your case.
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+St = ParMatrix(T, gt, gt)
+Sx = ParMatrix(T, gx, gx)
+Sy = ParMatrix(T, gy, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+S = Sy ⊗ Sx ⊗ St
+S = distribute(S, partition)
+
+# Parametrize our transform
+θ = init(S) |> gpu
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = S(θ) * vec(x)
+
+# Compute the gradient of our parameters w.r.t. some objective
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+
+MPI.Finalize()
+```
+
+If you have [`mpiexecjl`](https://juliaparallel.org/MPI.jl/stable/usage/#Installation) set up, you can run the above by doing:
+
+```shell
+mpiexecjl --project=/path/to/your/environment -n NTASKS julia code_above.jl
+```
+
+OR if you have a HPC cluster with [`slurm`](https://slurm.schedmd.com/documentation.html) set up, you can do:
+
+```shell
+salloc --gpus=NTASKS --time=01:00:00 --ntasks=NTASKS --gpus-per-task=1 --gpu-bind=none
+srun julia --project=/path/to/your/environment code_above.jl
+```
+
+!!! warning "Allocation"
+    Your `salloc` might look different based on your HPC cluster
diff --git a/docs/src/examples/3D_DFFT.md b/docs/src/examples/3D_DFFT.md
new file mode 100644
index 0000000..56aec80
--- /dev/null
+++ b/docs/src/examples/3D_DFFT.md
@@ -0,0 +1,73 @@
+### Distributed FFT of a 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+Make sure to add the necessary dependencies. You might also need to load a proper MPI implementation based on your hardware.
+
+```julia
+julia> ]
+(v1.9) activate /path/to/your/environment
+(env) add MPI CUDA ParametricOperators
+```
+
+!!! warning "To run on multiple GPUs"
+    If you wish to run on multiple GPUs, make sure the GPUs are bound to different tasks. The approach we use is to unbind our GPUs on request and assign them manually:
+
+    ```julia
+    CUDA.device!(rank % 4)
+    ```
+
+    which might be different if you have more or fewer than 4 GPUs per node. Also, make sure your MPI distribution is functional.
+
+Copy the following code into a `.jl` file:
+```julia
+using ParametricOperators
+using CUDA
+using MPI
+
+MPI.Init()
+
+comm = MPI.COMM_WORLD
+rank = MPI.Comm_rank(comm)
+size = MPI.Comm_size(comm)
+
+# Julia requires you to manually assign the GPUs; modify to your case.
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+Ft = ParDFT(T, gt)
+Fx = ParDFT(Complex{T}, gx)
+Fy = ParDFT(Complex{T}, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+F = Fy ⊗ Fx ⊗ Ft
+F = distribute(F, partition)
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = F * vec(x)
+
+MPI.Finalize()
+```
+
+If you have [`mpiexecjl`](https://juliaparallel.org/MPI.jl/stable/usage/#Installation) set up, you can run the above by doing:
+
+```shell
+mpiexecjl --project=/path/to/your/environment -n NTASKS julia code_above.jl
+```
+
+OR if you have a HPC cluster with [`slurm`](https://slurm.schedmd.com/documentation.html) set up, you can do:
+
+```shell
+salloc --gpus=NTASKS --time=01:00:00 --ntasks=NTASKS --gpus-per-task=1 --gpu-bind=none
+srun julia --project=/path/to/your/environment code_above.jl
+```
+
+!!! warning "Allocation"
+    Your `salloc` might look different based on your HPC cluster
diff --git a/docs/src/examples/3D_FFT.md b/docs/src/examples/3D_FFT.md
new file mode 100644
index 0000000..1618005
--- /dev/null
+++ b/docs/src/examples/3D_FFT.md
@@ -0,0 +1,29 @@
+### FFT of 3D Tensor
+
+!!! note "Jump right in"
+    To get started, you can run some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+```julia
+using ParametricOperators
+
+T = Float32
+
+gt, gx, gy = 100, 100, 100
+
+# Define a transform along each dimension
+Ft = ParDFT(T, gt)
+Fx = ParDFT(Complex{T}, gx)
+Fy = ParDFT(Complex{T}, gy)
+
+# Create a Kronecker operator that chains together the transforms
+F = Fy ⊗ Fx ⊗ Ft
+
+# Apply the transform on a random input
+x = rand(T, gt, gx, gy) |> gpu
+y = F * vec(x)
+```
+
+Run the above by doing:
+```shell
+julia --project=/path/to/your/environment code_above.jl
+```
diff --git a/docs/src/future.md b/docs/src/future.md
new file mode 100644
index 0000000..9ae84a9
--- /dev/null
+++ b/docs/src/future.md
@@ -0,0 +1,8 @@
+### Future Work
+
+Minimize computation and communication costs over all transformations of the abstract syntax tree such that
+- the transformations follow mathematical rules.
+- the transformations keep compute and memory bounded to the specification of the system.
+- the transformations give an optimal distribution pattern at a given point in the computation.
+
+Add tensor-tensor contraction to the abstraction.
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 0000000..32e48f8
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,44 @@
+### ParametricOperators.jl
+
+Modern machine learning and scientific computing are increasingly interested in the idea of tensor programs: programs whose fundamental object is the tensor, a multidimensional array of numbers.
+
+`ParametricOperators.jl` is an abstraction based on Kronecker products, designed for manipulating large-scale tensors. It utilizes lazy operators to facilitate distribution and gradient computation effectively.
+
+!!! note "Example usage of ParametricOperators.jl"
+    [`ParametricDFNOs.jl`](https://github.com/slimgroup/ParametericDFNOs.jl) is a library built on top of [`ParametricOperators.jl`](https://github.com/slimgroup/ParametricOperators.jl) that allows for large-scale machine learning using Fourier Neural Operators (FNOs)
+
+Read our paper [here](https://www.youtube.com/watch?v=xvFZjo5PgG0).
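+
+As a small, illustrative sketch of the abstraction (see the Quick Start for details), operators are built and composed lazily, before ever touching the data they act on:
+
+```julia
+using ParametricOperators
+
+T = Float32
+
+# Lazy DFTs along each dimension of a 10x20 tensor
+Fx = ParDFT(T, 10)
+Fy = ParDFT(Complex{T}, 20)
+
+# Compose into a single lazy 2D transform and apply it
+F = Fy ⊗ Fx
+y = F * vec(rand(T, 10, 20))
+```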
+
+### Authors
+
+This package is developed and maintained by Felix J. Herrmann's [SlimGroup](https://slim.gatech.edu/) at Georgia Institute of Technology. The main contributors to this package are:
+
+- [Richard Rex](https://www.linkedin.com/in/richard-rex/)
+- Thomas Grady
+- Mark Gliens
+
+### License
+
+```
+MIT License
+
+Copyright (c) 2024 SLIM Group @ Georgia Institute of Technology
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+```
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
new file mode 100644
index 0000000..7a1601d
--- /dev/null
+++ b/docs/src/quickstart.md
@@ -0,0 +1,150 @@
+## Installation
+
+Add `ParametricOperators.jl` as a dependency to your environment.
+
+To add, either do:
+
+```julia
+julia> ]
+(v1.9) add ParametricOperators
+```
+
+OR
+
+```julia
+julia> using Pkg
+julia> Pkg.activate("path/to/your/environment")
+julia> Pkg.add("ParametricOperators")
+```
+
+!!! note "Jump right in"
+    To get started, you can also try running some [examples](https://github.com/turquoisedragon2926/ParametricOperators.jl-Examples)
+
+## Simple Operator
+
+Make sure to include the package in your environment:
+
+```julia
+using ParametricOperators
+```
+
+Let's start by defining a matrix operator of size `10x10`:
+
+```julia
+A = ParMatrix(10, 10)
+```
+
+Now, we parametrize our operator with some weights `θ`:
+
+```julia
+θ = init(A)
+```
+
+We can now apply our operator to some random input:
+
+```julia
+x = rand(10)
+A(θ) * x
+```
+
+## Gradient Computation
+
+!!! warning "Limited AD support"
warning "Limited AD support" + Current support only provided for Zygote.jl + +Make sure to include an AD package in your environment + +```julia +using Zygote +``` + +Using the example above, one can find the gradient of the weights or your input w.r.t to some objective using a standard AD package: + +```julia +# Gradient w.r.t weights +θ′ = gradient(θ -> sum(A(θ) * x), θ) + +# Gradient w.r.t input +x′ = gradient(x -> sum(A(θ) * x), x) +``` + +## Chaining Operators + +We can chain several operators together through multiple ways + +### Compose Operator + +Consider two matrices: + +```julia +L = ParMatrix(10, 4) +R = ParMatrix(10, 4) +``` + +We can now chain and parametrize them by: + +```julia +C = L * R' +θ = init(C) +``` + +This allows us to perform several operations such as + +```julia + +using Zygote +using LinearAlgebra + +x = rand(10) +C(θ) * x +gradient(θ -> norm(C(θ) * x), θ) +``` + +without ever constructing the full matrix `LR'`, a method more popularly referred to as [LR-decomposition](https://link.springer.com/chapter/10.1007/978-3-662-65458-3_11). + +### Kronecker Operator + +[Kronecker Product](https://en.wikipedia.org/wiki/Kronecker_product) is a most commonly used to represent the outer product on 2 matrices. + +We can use this to describe linearly separable transforms that act along different dimensions on a given input tensor. + +For example, consider the following tensor: + +```julia +T = Float32 +x = rand(T, 10, 20, 30) +``` + +We now define the transformation that would act along each dimension. In this case, a [Fourier Transform](https://en.wikipedia.org/wiki/Fourier_transform). + +#### Fourier Transform Example +```julia +Fx = ParDFT(T, 10) +Fy = ParDFT(Complex{T}, 20) +Fz = ParDFT(Complex{T}, 30) +``` + +We can now chain them together using a Kronecker Product: + +```julia +F = Fz ⊗ Fy ⊗ Fx +``` + +Now, we can compute this action on our input by simply doing: + +```julia +F * vec(x) +``` + +!!! tip "This can be extended to any parametrized operators" + For example, in order to apply a linear transform along the y, z dimension while performing no operation along x, one can do: + ```julia + Sx = ParIdentity(T, 10) + Sy = ParMatrix(T, 20, 20) + Sz = ParMatrix(T, 30, 30) + + S = Sz ⊗ Sy ⊗Sx + θ = init(S) + + S(θ) * vec(x) + ``` diff --git a/examples/3D_DConv.jl b/examples/3D_DConv.jl new file mode 100644 index 0000000..80d99f4 --- /dev/null +++ b/examples/3D_DConv.jl @@ -0,0 +1,41 @@ +using ParametricOperators +using CUDA +using MPI +using Zygote + +MPI.Init() + +comm = MPI.COMM_WORLD +rank = MPI.Comm_rank(comm) +size = MPI.Comm_size(comm) + +# Julia requires you to manually assign the gpus, modify to your case. 
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+St = ParMatrix(T, gt, gt)
+Sx = ParMatrix(T, gx, gx)
+Sy = ParMatrix(T, gy, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+S = Sy ⊗ Sx ⊗ St
+S = distribute(S, partition)
+
+# Parametrize our transform
+θ = init(S) |> gpu
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = S(θ) * vec(x)
+
+# Compute the gradient of our parameters w.r.t. some objective
+θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
+
+MPI.Finalize()
diff --git a/examples/3D_DFFT.jl b/examples/3D_DFFT.jl
new file mode 100644
index 0000000..a9b9913
--- /dev/null
+++ b/examples/3D_DFFT.jl
@@ -0,0 +1,34 @@
+using ParametricOperators
+using CUDA
+using MPI
+
+MPI.Init()
+
+comm = MPI.COMM_WORLD
+rank = MPI.Comm_rank(comm)
+size = MPI.Comm_size(comm)
+
+# Julia requires you to manually assign the GPUs; modify to your case.
+CUDA.device!(rank % 4)
+partition = [1, 1, size]
+
+T = Float32
+
+# Define your Global Size and Data Partition
+gt, gx, gy = 100, 100, 100
+nt, nx, ny = [gt, gx, gy] .÷ partition
+
+# Define a transform along each dimension
+Ft = ParDFT(T, gt)
+Fx = ParDFT(Complex{T}, gx)
+Fy = ParDFT(Complex{T}, gy)
+
+# Create and distribute the Kronecker operator that chains together the transforms
+F = Fy ⊗ Fx ⊗ Ft
+F = distribute(F, partition)
+
+# Apply the transform on a random input
+x = rand(T, nt, nx, ny) |> gpu
+y = F * vec(x)
+
+MPI.Finalize()
diff --git a/examples/3D_conv.jl b/examples/3D_conv.jl
index 5c1a7da..33df147 100644
--- a/examples/3D_conv.jl
+++ b/examples/3D_conv.jl
@@ -1,43 +1,24 @@
-using Pkg
-Pkg.activate("./")
-
 using ParametricOperators
 using Zygote
-using CUDA
-using MPI
-
-MPI.Init()
-
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-size = MPI.Comm_size(comm)
-
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
 
 T = Float32
 
-# Define your Global Size and Data Partition
 gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
 
 # Define a transform along each dimension
 St = ParMatrix(T, gt, gt)
 Sx = ParMatrix(T, gx, gx)
 Sy = ParMatrix(T, gy, gy)
 
-# Create and distribute the Kronecker operator than chains together the transforms
+# Create a Kronecker operator that chains together the transforms
 S = Sy ⊗ Sx ⊗ St
-S = distribute(S, partition)
 
 # Parametrize our transform
 θ = init(S) |> gpu
 
 # Apply the transform on a random input
-x = rand(T, nt, nx, ny) |> gpu
+x = rand(T, gt, gx, gy) |> gpu
 y = S(θ) * vec(x)
 
 # Compute the gradient wrt some objective of our parameters
 θ′ = gradient(θ -> sum(S(θ) * vec(x)), θ)
-
-MPI.Finalize()
diff --git a/examples/3d_fft.jl b/examples/3d_fft.jl
index 4ff2e79..05db16e 100644
--- a/examples/3d_fft.jl
+++ b/examples/3d_fft.jl
@@ -1,33 +1,17 @@
-using Pkg
-Pkg.activate("./")
-
 using ParametricOperators
-using CUDA
-using MPI
-
-MPI.Init()
-
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-size = MPI.Comm_size(comm)
-
-CUDA.device!(rank % 4)
-partition = [1, 1, size]
 
 T = Float32
 
-# Global and Local Partition
 gt, gx, gy = 100, 100, 100
-nt, nx, ny = [gt, gx, gy] .÷ partition
 
+# Define a transform along each dimension
 Ft = ParDFT(T, gt)
 Fx = ParDFT(Complex{T}, gx)
 Fy = ParDFT(Complex{T}, gy)
 
+# Create a Kronecker operator that chains together the transforms
 F = Fy ⊗ Fx ⊗ Ft
-F = distribute(F, partition)
 
-x = rand(T, nt, nx, ny) |> gpu
+# Apply the transform on a random input
+x = rand(T, gt, gx, gy) |> gpu
 y = F * vec(x)
-
-MPI.Finalize()
diff --git a/examples/fno.jl b/examples/fno.jl
deleted file mode 100644
index 0455ac8..0000000
--- a/examples/fno.jl
+++ /dev/null
@@ -1,54 +0,0 @@
-using CUDA
-using MAT
-using LinearAlgebra
-using NNlib
-using NNlibCUDA
-using ParametricOperators
-using Zygote
-
-function spectral_convolution(T, shape_spacetime, modes_spacetime, lifted_dim)
-
-    # Fourier transform along time/space
-    dft_time = ParDFT(T, shape_spacetime[end])
-    dft_space = ParKron([ParDFT(Complex{T}, s) for s in shape_spacetime[1:end-1]]...)
-    dft_spacetime = dft_time ⊗ dft_space
-
-    # Restriction to low-frequency modes
-    restrict = ParKron([ParRestriction(Complex{T}, Range(F), m) for (F, m) in zip(dft_spacetime.ops, reverse(modes_spacetime))]...)
-
-    # Combination dft/restriction along each channel dim
-    identity_channels = ParIdentity(T, lifted_dim)
-    dft_restrict = restrict*dft_spacetime
-    dft_all = dft_restrict ⊗ identity_channels
-
-    # Elementwise multiplication and channel mixing in frequency space
-    mix_channels = ParMatrix(Complex{T}, lifted_dim, lifted_dim)
-    elementwise_weighting = ParDiagonal(Complex{T}, Range(dft_restrict))
-    frequency_weight = elementwise_weighting ⊗ mix_channels
-
-    # Spectral conv
-    return dft_all'*frequency_weight*dft_all
-
-end
-
-function channel_mixing(T, shape_spacetime, lifted_dim)
-    mix_channels = ParMatrix(T, lifted_dim, lifted_dim)
-    identity_spacetime = ParIdentity(T, prod(shape_spacetime))
-    return identity_spacetime ⊗ mix_channels
-end
-
-function fno_block(x, spectral_conv, channel_mix)
-    y1 = spectral_conv(x)
-    y2 = channel_mix(x)
-    return gelu.(y1 .+ y2)
-end
-
-shape = [64, 64, 20]
-modes = [[1:8, 57:64], [1:8, 57:64], [1:8]]
-lifted_dim = 20
-
-T = Float64
-S = spectral_convolution(T, shape, modes, lifted_dim)
-W = channel_mixing(T, shape, lifted_dim)
-θ = init(S)
-init!(W, θ)
\ No newline at end of file
diff --git a/src/ParKron.jl b/src/ParKron.jl
index 003b579..b4196d7 100644
--- a/src/ParKron.jl
+++ b/src/ParKron.jl
@@ -282,7 +282,6 @@ function distribute(A::ParKron, comm_in::MPI.Comm, comm_out::MPI.Comm, parent_co
     coords_i = MPI.Cart_coords(comm_i)
 
     # Create repartition operator
-    !isequal(dims_prev, dims_i) && (MPI.Comm_rank(parent_comm) == 0) && println("Adding Repartition")
     !isequal(dims_prev, dims_i) && pushfirst!(ops, ParRepartition(DDT(Ai), comm_prev, comm_i, tuple(size_curr...)))
 
     # Create Kronecker w/ distributed identities