From bd8d7a2a452ec8f9afed823299412ac3cce65319 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 23 Feb 2024 16:57:57 +0100 Subject: [PATCH 01/17] spec: Shwap --- specs/src/SUMMARY.md | 2 +- specs/src/WIP.md | 1 - specs/src/shwap/spec.md | 247 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 248 insertions(+), 2 deletions(-) delete mode 100644 specs/src/WIP.md create mode 100644 specs/src/shwap/spec.md diff --git a/specs/src/SUMMARY.md b/specs/src/SUMMARY.md index dd6d59d972..c5e5e92fa9 100644 --- a/specs/src/SUMMARY.md +++ b/specs/src/SUMMARY.md @@ -1,3 +1,3 @@ # Summary -- [WIP](./WIP.md) +- [Shwap](./shwap/spec.md) diff --git a/specs/src/WIP.md b/specs/src/WIP.md deleted file mode 100644 index 85e6ff194b..0000000000 --- a/specs/src/WIP.md +++ /dev/null @@ -1 +0,0 @@ -# WIP diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md new file mode 100644 index 0000000000..8bea997c31 --- /dev/null +++ b/specs/src/shwap/spec.md @@ -0,0 +1,247 @@ +# Shwap Protocol Specification + +## Terms and Definitions + +The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", +"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and +"OPTIONAL" in this document are to be interpreted as described in BCP +14 [RFC2119] [RFC8174] when, and only when, they appear in all +capitals, as shown here. + +Commonly used terms in this document are described below. + +_**Shwap**_: The protocol described by this document. Shwap is a +portmanteau name of words share and swap. + +_**[Share][shares]**_: The core data structure of DataSquare **"swapped"** between peers. + +_**[DataSquare][square]**_: The DA square format used by Celestia DA network. + +_**[DAH][dah]**_: The Data Availability Header with Row and Column commitments. + +_**[Namespace][ns]**_: The namespace grouping sets of shares. + +_**Peer**_: An entity that can participate in a Shwap protocol. There are three types of peers: +client, server and node. 
+ +_**Client**_: The Peer that requests content by content identifies over Shwap. + +_**Server**_: The Peer that responds with content over Shwap. + +_**Node**_: The peer that namespacesis both the client and the server. + +_**Proof**_: Merkle inclusion proof of the data in the DataSquare. + +## Rationale + +### Multihashes and CID + +Shwap takes inspiration from content addressability, but breaks-free from hash-based only model to optimize message sizes +and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that isn't in fact a +hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of the messages +happens using externally provided data commitment. + +## Protocol Dependencies + +### Bitswap + +Shwap depends on Bitswap for swapping bits in fully distributed p2p-manner. + +## Share Identifiers + +This section defines list of supported share identifiers. Share identifiers defined by Shwap can be used to uniquely +identify any [share container](#share-containers) over a chain with arbitrary number of [DataSquares][square], like a range of +[shares][shares], a row or a [blob][blob]. Every share identifier relates to a respective share container and wise-versa. + +Identifiers are embeddable to narrow down to the needed content. (TODO: Describe better) + +Identifiers MUST have a fixed size for their fields. Subsequently, protobuf can't be used for CID serialization due to +varint usage. Instead, identifiers use simple binary big endian serialization. + +Table of supported identifiers with their respective multihash and codec codes. This table is supposed to be extended +whenever any new identifier is added. + +| Name | Multihash | Codec | +|----------|-----------|--------| +| RowID | 0x7811 | 0x7810 | +| SampleID | 0x7801 | 0x7800 | +| DataID | 0x7821 | 0x7820 | + +### RowID + +RowID identifies the [Row shares container](#row-container) in a [DataSquare][square]. 
+ +RowID identifiers are formatted as shown below: + +``` +RowID { + Height: u64; + RowIndex: u16; +} +``` + +The fields with validity rules that form RowID are: + +**Height**: A uint64 representing the chain height with the data square. It MUST be bigger than zero. + +**RowIndex**: An uint16 representing row index points to a particular row. It MUST not exceed the number of Row roots in +[DAH][dah]. + +Serialized RowID MUST have length of 10 bytes. + +### SampleID + +SampleID identifies a Sample container of a single share in a [DataSquare][square]. + +SampleID identifiers are formatted as shown below: +``` +SampleID { + RowID; + ShareIndex: u16; +} +``` + +The fields with validity rules that form SampleID are: + +[**RowID**](#rowid): A RowID of the sample. It MUST follow [RowID](#rowid) formatting and field validity rules. + +**ShareIndex**: A uint16 representing the index of the sampled share in the row. It MUST not exceed the number of Column +roots in [DAH][dah]. + +Serialized SampleID MUST have length of 12 bytes. + +### DataID + +DataID identifies [namespace][ns] Data container of shares within a _single_ Row. That is, namespace shares spanning +over multiple Rows are identified with multiple identifiers. + +DataID identifiers are formatted as shown below: +``` +DataID { + RowID; + Namespace; +} +``` + +The fields with validity rules that form DataID are: + +[**RowID**](#rowid): A RowID of the namespace data. It MUST follow [RowID](#rowid) formatting and field validity rules. + +[**Namespace**][ns]: A fixed-size bytes array representing the Namespace of interest. It MUST follow [Namespace][ns] +formatting and its validity rules. + +Serialized DataID MUST have length of 39 bytes. + +## Share Containers + +This section defines list of supported share containers. Share containers encapsulate a set of data shares with [DAH][dah] +inclusion proof. Share containers are identified by [share identifiers](#share-identifiers). 
+ +### Row Container + +Row containers encapsulate Row of the [DataSquare][square]. + +Row containers are protobuf formatted using the following proto3 schema: +```protobuf +syntax = "proto3"; + +message Row { + bytes row_id = 1; + repeated bytes row_half = 2; +} +``` + +The fields with validity rules that form Row containers are: + +[**RowID**](#rowid): A RowID of the Row Container. It MUST follow [RowID](#rowid) formatting and field validity rules. + +**RowHalf**: A two-dimensional variable size byte arrays representing left half of shares in the row. It MUST be equal +to the number of Columns roots in [DAH][dah] divided by two. These shares MUST only be from the left half of the row. +The right half is computed using Leopard GF16 Reed-Solomon erasure-coding. Afterward, the [NMT][nmt] is built over both +halves and the computed NMT root MUST be equal to the respective Row root in [DAH][dah]. + +### Sample Container + +Sample containers encapsulate single shares of the [DataSquare][square]. + +Sample containers are protobuf formatted using the following proto3 schema: +```protobuf +syntax = "proto3"; + +message Sample { + bytes sample_id = 1; + bytes sample_share = 2; + Proof sample_proof = 3; + ProofType proof_type = 4; +} + +enum ProofType { + RowProofType = 0; + ColProofType = 1; +} +``` + +The fields with validity rules that form Sample containers are: + +[**SampleID**](#sampleid): A SampleID of the Sample container. It MUST follow [SampleID](#sampleid) formatting and field +validity rules. + +**SampleShare**: A variable size array representing the share contained in the sample. Each share MUST follow [share +formatting and validity][shares-format] rules. + +**Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] +and be verified against the respective root from Row or Column axis in [DAH][dah]. The axis is defined by ProofType field. 
+ +**ProofType**: An enum defining which root the Proof is coming from. It MUST be either RowProofType or ColumnProofType. + +### Data Container + +Data containers encapsulate user submitted data under [namespaces][ns]. + +Data containers are protobuf formatted using the following proto3 schema: +```protobuf +syntax = "proto3"; + +message Data { + bytes data_id = 1; + repeated bytes data_shares = 2; + Proof data_proof = 3; +} +``` + +The fields with validity rules that form Data containers are: + +[**DataID**](#dataid): A DataID of the Data container. It MUST follow [DataID](#dataid) formatting and field validity +rules. + +**DataShares**: A two-dimensional variable size byte arrays representing left data shares of a namespace in the row. +Each share MUST follow [share formatting and validity][shares-format] rules. + +**Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] +and be verified against the respective root from Row or Column axis in [DAH][dah]. The axis is defined by ProofType field. + +Namespace data may span over multiple rows in which case all the data is encapsulated in multiple containers. This is +done + +## Protocol Extensions + +This section is a placeholder for future protocol extensions like new new identifiers and containers. + +## Considerations + +### Bitswap CID integration + +The naive question would be: "Why not to make content verification after Bitswap provided it back over its API?" +Intuitively, this would simplify a lot and wouldn't require "hacking" CID. However, this has an important downside - +the Bitswap in such case would consider the content as fetched and valid, sending DONT_WANT message to its peers, while +the message might be invalid according to the verification rules. 
+ +[square]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#2d-reed-solomon-encoding-scheme +[shares]: https://celestiaorg.github.io/celestia-app/specs/shares.html#abstract +[shares-format]: https://celestiaorg.github.io/celestia-app/specs/shares.html#share-format +[blob]: https://celestiaorg.github.io/celestia-app/specs/data_square_layout.html#blob-share-commitment-rules +[dah]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#availabledataheader +[ns]: https://celestiaorg.github.io/celestia-app/specs/namespace.html#abstract +[nmt]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md +[nmt-pb]: https://github.com/celestiaorg/nmt/blob/f5556676429118db8eeb5fc396a2c75ab12b5f20/pb/proof.proto +[nmt-verify]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md#namespace-proof-verification \ No newline at end of file From 5a818beb4a925541ebde1c2ce56bb4e31ac8dd73 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 23 Feb 2024 17:00:31 +0100 Subject: [PATCH 02/17] add abstract and motivatin --- specs/src/shwap/spec.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md index 8bea997c31..f31a89fcb1 100644 --- a/specs/src/shwap/spec.md +++ b/specs/src/shwap/spec.md @@ -1,5 +1,25 @@ # Shwap Protocol Specification +## Abstract + +This document specifies the Shwap p2p protocol. Shwap provides scalable and extensible framework for exchanging and +swapping of shared data for Celestia's Data Availability network and beyond. + +## Motivation + +The current Data Availability Sampling (DAS) network protocol is inefficient. A _single_ sample operation takes log2(k) network +round-trips(where k is the square size). This is not practical and does not scale for the theoretically unlimited data +square that the Celestia network enables. 
The main motive here is a protocol with O(1) round-trip for _multiple_ samples, preserving +the assumption of having 1/n honest peers connected. + +Initially, Bitswap and IPLD were adopted as the basis for the DA network protocols, including DAS, +block synchronization (BS), and blob/namespace data retrieval (ND). They gave battle-tested protocols and tooling with +pluggability to rapidly scaffold Celestia's DA network. However, it came with the price of scalability limits and +round-trips resulting in BS slower than block production. Before the network launch, the transition +to the optimized [ShrEx protocol][shrex] for BS and integrating [CAR and DAGStore-based storage][storage] happened +optimizing BS and ND. However, DAS was left untouched, preserving its weak scalability and roundtrip inefficiency. Shwap +addresses these and provides an extensible and flexible framework for BS, ND, and beyond. + ## Terms and Definitions The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", From 7d02fb726edb1d0c540154ffc5f592389d9ae96b Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 1 Mar 2024 16:31:48 +0100 Subject: [PATCH 03/17] streamline spec and add more sections --- specs/src/shwap/spec.md | 283 ++++++++++++++++++++++++++-------------- 1 file changed, 184 insertions(+), 99 deletions(-) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md index f31a89fcb1..5325827fd4 100644 --- a/specs/src/shwap/spec.md +++ b/specs/src/shwap/spec.md @@ -2,25 +2,53 @@ ## Abstract -This document specifies the Shwap p2p protocol. Shwap provides scalable and extensible framework for exchanging and -swapping of shared data for Celestia's Data Availability network and beyond. +This document specifies Shwap - the simple and expressive, yet extensible and future-proof messaging framework aiming to +solve critical inefficiencies and standardise messaging of Celestia's Data Availability p2p network. 
+ +Shwap defines messaging framework to be exchanged around the DA p2p network in trust-minimized way and without enforcing +transport(QUIC/TCP or IP) or application layer protocol semantics(e.g HTTP/x). Using this framework, Shwap +declares the most common messages and provides options on how to stack them with lower-level protocols. +Shwap can be stacked together with application protocol like HTTP/x, [KadDHT][kaddht], [Bitswap][bitswap] or any custom +protocol. ## Motivation -The current Data Availability Sampling (DAS) network protocol is inefficient. A _single_ sample operation takes log2(k) network -round-trips(where k is the square size). This is not practical and does not scale for the theoretically unlimited data -square that the Celestia network enables. The main motive here is a protocol with O(1) round-trip for _multiple_ samples, preserving -the assumption of having 1/n honest peers connected. +The current Data Availability Sampling (DAS) network protocol is inefficient. A _single_ sample operation takes log2(k) +network round-trips(where k is the square size). This is not practical and does not scale for the theoretically unlimited +data square that the Celestia network enables. The main motive here is a protocol with O(1) round-trip for _multiple_ +samples, preserving the assumption of having 1/n honest peers connected. Initially, Bitswap and IPLD were adopted as the basis for the DA network protocols, including DAS, block synchronization (BS), and blob/namespace data retrieval (ND). They gave battle-tested protocols and tooling with pluggability to rapidly scaffold Celestia's DA network. However, it came with the price of scalability limits and round-trips resulting in BS slower than block production. Before the network launch, the transition to the optimized [ShrEx protocol][shrex] for BS and integrating [CAR and DAGStore-based storage][storage] happened -optimizing BS and ND. 
However, DAS was left untouched, preserving its weak scalability and roundtrip inefficiency. Shwap -addresses these and provides an extensible and flexible framework for BS, ND, and beyond. +optimizing BS and ND. However, DAS was left untouched, preserving its weak scalability and roundtrip inefficiency. + +Shwap messaging stacked together with Bitswap protocol directly addresses described inefficiency and provides foundation +for efficient communication for BS, ND, and beyond. + +## Rationale + +The atomic primitive of Celestia's DA network is a share. Shwap standardize messaging and serialization for shares. +Shares are grouped together forming more complex data types(Rows, Blobs, etc). These data types are encapsulated in +containers, e.g. Row container groups shares of a particular row. Containers can be identified with share identifiers +in order to request, advertise or index the containers. The combination of containers and identifiers provides extensible +and expressive messaging framework for groups of shares and enable efficient single round-trip request-response +communication. + +There are many share groups or containers known in Celestia network and systemizing this is the main reason behind setting +up this simple messaging framework. There needs to be a single place with all the possible Celestia DA messages defined +which node software and protocol researchers can rely and coordinate on. Besides, this framework is designed to be +future-proof and sustain changes in the core protocol's data structures and proving system, as long shares stays the +de facto atomic data type. + +Besides, there needs to be systematization and common knowledge-base with all the edge cases for possible protocol +compositions of Shwap with lower-level protocols Bitswap, KadDHT or Shrex, which Shwap aims to describe. 
+ +## Specification -## Terms and Definitions +### Terms and Definitions The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and @@ -48,46 +76,44 @@ _**Client**_: The Peer that requests content by content identifies over Shwap. _**Server**_: The Peer that responds with content over Shwap. -_**Node**_: The peer that namespacesis both the client and the server. +_**Node**_: The peer that is both the client and the server. -_**Proof**_: Merkle inclusion proof of the data in the DataSquare. +_**Proof**_: A Merkle inclusion proof of the data in the DataSquare. -## Rationale +### Message Framework -### Multihashes and CID +This section defines the messaging framework of Shwap. Every group of shares that needs to be exchanged over the network +MUST define its [share identifier](#share-identifiers) and [share container](#share-containers), as well as follow +their described rules. -Shwap takes inspiration from content addressability, but breaks-free from hash-based only model to optimize message sizes -and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that isn't in fact a -hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of the messages -happens using externally provided data commitment. +#### Share Identifiers -## Protocol Dependencies +Share identifiers defined by Shwap can be used to uniquely identify any [share container](#share-containers) over a chain +with arbitrary number of [DataSquares][square], like a range of [shares][shares], a row or a [blob][blob]. Every share +identifier relates to a respective share container and vice versa. -### Bitswap +Identifiers MUST have a fixed size for their fields. Subsequently, protobuf SHOULD NOT be used for CID serialization due +to varints and lack of fixed size arrays. Instead, identifiers use simple binary big endian serialization. 
-Shwap depends on Bitswap for swapping bits in fully distributed p2p-manner. +Identifiers MAY embed each other to narrow down the scope of needed shares. For example, [SampleID](#sampleid) embeds +[RowID](#rowid) as every sample lay on a particular row. -## Share Identifiers +#### Share Containers -This section defines list of supported share identifiers. Share identifiers defined by Shwap can be used to uniquely -identify any [share container](#share-containers) over a chain with arbitrary number of [DataSquares][square], like a range of -[shares][shares], a row or a [blob][blob]. Every share identifier relates to a respective share container and wise-versa. +Share containers encapsulate a set of data shares with [DAH][dah] inclusion proof. Share containers are identified by +[share identifiers](#share-identifiers). -Identifiers are embeddable to narrow down to the needed content. (TODO: Describe better) +#### Versioning -Identifiers MUST have a fixed size for their fields. Subsequently, protobuf can't be used for CID serialization due to -varint usage. Instead, identifiers use simple binary big endian serialization. +In case defined share container or identifier requires an incompatible change the new message type MAY be introduced +suffixed with new major version starting from v1. E.g. if Row message needs a revision, RowV1 is created. -Table of supported identifiers with their respective multihash and codec codes. This table is supposed to be extended -whenever any new identifier is added. +### Messages -| Name | Multihash | Codec | -|----------|-----------|--------| -| RowID | 0x7811 | 0x7810 | -| SampleID | 0x7801 | 0x7800 | -| DataID | 0x7821 | 0x7820 | +This section defines all the supported Shwap messages which includes share identifiers and share containers. All the new +future messages should be described in here. -### RowID +#### RowID RowID identifies the [Row shares container](#row-container) in a [DataSquare][square]. 
@@ -109,55 +135,7 @@ The fields with validity rules that form RowID are: Serialized RowID MUST have length of 10 bytes. -### SampleID - -SampleID identifies a Sample container of a single share in a [DataSquare][square]. - -SampleID identifiers are formatted as shown below: -``` -SampleID { - RowID; - ShareIndex: u16; -} -``` - -The fields with validity rules that form SampleID are: - -[**RowID**](#rowid): A RowID of the sample. It MUST follow [RowID](#rowid) formatting and field validity rules. - -**ShareIndex**: A uint16 representing the index of the sampled share in the row. It MUST not exceed the number of Column -roots in [DAH][dah]. - -Serialized SampleID MUST have length of 12 bytes. - -### DataID - -DataID identifies [namespace][ns] Data container of shares within a _single_ Row. That is, namespace shares spanning -over multiple Rows are identified with multiple identifiers. - -DataID identifiers are formatted as shown below: -``` -DataID { - RowID; - Namespace; -} -``` - -The fields with validity rules that form DataID are: - -[**RowID**](#rowid): A RowID of the namespace data. It MUST follow [RowID](#rowid) formatting and field validity rules. - -[**Namespace**][ns]: A fixed-size bytes array representing the Namespace of interest. It MUST follow [Namespace][ns] -formatting and its validity rules. - -Serialized DataID MUST have length of 39 bytes. - -## Share Containers - -This section defines list of supported share containers. Share containers encapsulate a set of data shares with [DAH][dah] -inclusion proof. Share containers are identified by [share identifiers](#share-identifiers). - -### Row Container +#### Row Container Row containers encapsulate Row of the [DataSquare][square]. @@ -175,12 +153,33 @@ The fields with validity rules that form Row containers are: [**RowID**](#rowid): A RowID of the Row Container. It MUST follow [RowID](#rowid) formatting and field validity rules. 
-**RowHalf**: A two-dimensional variable size byte arrays representing left half of shares in the row. It MUST be equal -to the number of Columns roots in [DAH][dah] divided by two. These shares MUST only be from the left half of the row. -The right half is computed using Leopard GF16 Reed-Solomon erasure-coding. Afterward, the [NMT][nmt] is built over both +**RowHalf**: A two-dimensional variable size byte arrays representing left half of shares in the row. It MUST be equal +to the number of Columns roots in [DAH][dah] divided by two. These shares MUST only be from the left half of the row. +The right half is computed using Leopard GF16 Reed-Solomon erasure-coding. Afterward, the [NMT][nmt] is built over both halves and the computed NMT root MUST be equal to the respective Row root in [DAH][dah]. -### Sample Container +#### SampleID + +SampleID identifies a Sample container of a single share in a [DataSquare][square]. + +SampleID identifiers are formatted as shown below: +``` +SampleID { + RowID; + ColumnIndex: u16; +} +``` + +The fields with validity rules that form SampleID are: + +[**RowID**](#rowid): A RowID of the sample. It MUST follow [RowID](#rowid) formatting and field validity rules. + +**ColumnIndex**: A uint16 representing the column index of the sampled share; in other words share index in the row. It +MUST NOT exceed the number of Column roots in [DAH][dah]. + +Serialized SampleID MUST have length of 12 bytes. + +#### Sample Container Sample containers encapsulate single shares of the [DataSquare][square]. @@ -206,15 +205,37 @@ The fields with validity rules that form Sample containers are: [**SampleID**](#sampleid): A SampleID of the Sample container. It MUST follow [SampleID](#sampleid) formatting and field validity rules. -**SampleShare**: A variable size array representing the share contained in the sample. 
Each share MUST follow [share formatting and validity][shares-format] rules. -**Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] +**Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] and be verified against the respective root from Row or Column axis in [DAH][dah]. The axis is defined by ProofType field. -**ProofType**: An enum defining which root the Proof is coming from. It MUST be either RowProofType or ColumnProofType. +**ProofType**: An enum defining which root the Proof is coming from. It MUST be either RowProofType or ColumnProofType. + +#### DataID + +DataID identifies [namespace][ns] Data container of shares within a _single_ Row. That is, namespace shares spanning +over multiple Rows are identified with multiple identifiers. + +DataID identifiers are formatted as shown below: +``` +DataID { + RowID; + Namespace; +} +``` + +The fields with validity rules that form DataID are: + +[**RowID**](#rowid): A RowID of the namespace data. It MUST follow [RowID](#rowid) formatting and field validity rules. + +[**Namespace**][ns]: A fixed-size bytes array representing the Namespace of interest. It MUST follow [Namespace][ns] +formatting and its validity rules. + +Serialized DataID MUST have length of 39 bytes. -### Data Container +#### Data Container Data containers encapsulate user submitted data under [namespaces][ns]. @@ -243,19 +264,83 @@ and be verified against the respective root from Row or Column axis in [DAH][dah Namespace data may span over multiple rows in which case all the data is encapsulated in multiple containers. This is done -## Protocol Extensions +## Protocol Compositions -This section is a placeholder for future protocol extensions like new new identifiers and containers. +This sections specifies compositions of Shwap with other protocols. 
While Shwap is transport agnostic there are rough +edges on the protocol integration which every composition specification has to describe. -## Considerations +### Bitswap + +[Bitswap][bitswap] is an application-level protocol designed for sharing verifiable data across peer-to-peer networks. +Bitswap operates as a dynamic want-list exchange among peers in a network. Peers continuously update and share their +want-lists of desired data in real-time. If at least one connected peer has the needed data, it is promptly fetched. +This ongoing exchange ensures that as soon as any peer acquires the sought-after data, it can instantly share it with +those in need. + +Shwap is designed to be synergetic with Bitswap, as that's the primary composition to be deployed in Celestia's DA +network. Bitswap provides 1/N peers guarantee and can parallelize fetching across multiple peers. Both of these properties +greatly contribute to efficient DAS protocol of Celestia. + +Bitswap runs over libp2p stack which provides QUIC transport integration. Subsequently, Shwap will benefit from features +libp2p provides together with transport protocol advancements introduced in QUIC. -### Bitswap CID integration +#### Multihashes and CID + +Bitswap is tightly coupled with Multihash and CID notions establishing the content addressability property. Shwap takes +inspiration from content addressability, but breaks-free from hash-based only model to optimize message sizes +and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that isn't in fact a +hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of the messages +happens using externally provided data commitment. + +However, Bitswap still requires multihashes and CID codecs to be registered. Therefore, we provide a table for the +supported [share identifiers](#share-identifiers) with their respective multihash and CID codec codes. 
This table +is supposed to be extended whenever any new share identifier is added. + +| Name | Multihash | Codec | +|----------|-----------|--------| +| RowID | 0x7811 | 0x7810 | +| SampleID | 0x7801 | 0x7800 | +| DataID | 0x7821 | 0x7820 | The naive question would be: "Why not to make content verification after Bitswap provided it back over its API?" -Intuitively, this would simplify a lot and wouldn't require "hacking" CID. However, this has an important downside - -the Bitswap in such case would consider the content as fetched and valid, sending DONT_WANT message to its peers, while -the message might be invalid according to the verification rules. +Intuitively, this would simplify a lot and wouldn't require "hacking" CID. However, this has an important downside - +the Bitswap in such case would consider the request finalized and the content as fetched and valid, sending DONT_WANT +message to its peers, while the message might still be invalid according to the verification rules. + +## Backwards Compatibility + +Shwap is incompatible with the old sampling protocol. + +After rigorous investigation, celestia-node team decided against _implementing_ backward compatibility with +the old protocol into the node client due to the immense complications it brings. Instead, the simple and time-efficient +strategy is transiently deploying infrastructure for old and new versions, allowing network participants to migrate +gradually to the latest version. We will first deprecate the old version, and once the majority has migrated, we will +terminate the old infrastructure. + +## Considerations + +### Security + +Shwap does not change the security model of Celestia's Data Availability network; it only changes the underlying +protocol for data retrieval. + +Essentially, the network and its codebase get simplified and require less code and infrastructure to operate. 
This in turn +decreases the number of implementation vulnerabilities, DOS vectors, message amplification, and resource exhaustion attacks. +However, new bugs may be introduced as with any new protocol. + +### Protobuf Serialization + +Protobuf is a widely adopted serialization format and is used within Celestia's protocols. This was quite an obvious choice +for consistency reasons, even though we could choose other more efficient and advanced formats like Cap'n Proto. + +## Reference Implementation + +- [Go reference implementation with Bitswap composition][gimpl] +- [Rust implementation with Bitswap composition][rimpl] +[shrex]: https://github.com/celestiaorg/celestia-node/blob/0abd16bbb05bf3016595498844a588ef55c63d2d/docs/adr/adr-013-blocksync-overhaul-part-2.md +[storage]: https://github.com/celestiaorg/celestia-node/blob/a33c80e20da684d656c7213580be7878bcd27cf4/docs/adr/adr-011-blocksync-overhaul-part-1.md +[bitswap]: https://docs.ipfs.tech/concepts/bitswap/ [square]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#2d-reed-solomon-encoding-scheme [shares]: https://celestiaorg.github.io/celestia-app/specs/shares.html#abstract [shares-format]: https://celestiaorg.github.io/celestia-app/specs/shares.html#share-format From 7ea064b9dfd8865a5ef394bc069bda7469427563 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Sat, 2 Mar 2024 21:45:11 +0100 Subject: [PATCH 04/17] format and lint --- specs/src/shwap/spec.md | 86 ++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md index 5325827fd4..b41cff1b73 100644 --- a/specs/src/shwap/spec.md +++ b/specs/src/shwap/spec.md @@ -3,19 +3,19 @@ ## Abstract This document specifies Shwap - the simple and expressive, yet extensible and future-proof messaging framework aiming to -solve critical inefficiencies and standardise messaging of Celestia's Data Availability p2p network. 
+solve critical inefficiencies and standardise messaging of Celestia's Data Availability p2p network. Shwap defines messaging framework to be exchanged around the DA p2p network in trust-minimized way and without enforcing -transport(QUIC/TCP or IP) or application layer protocol semantics(e.g HTTP/x). Using this framework, Shwap -declares the most common messages and provides options on how to stack them with lower-level protocols. -Shwap can be stacked together with application protocol like HTTP/x, [KadDHT][kaddht], [Bitswap][bitswap] or any custom +transport(QUIC/TCP or IP) or application layer protocol semantics(e.g HTTP/x). Using this framework, Shwap +declares the most common messages and provides options on how to stack them with lower-level protocols. +Shwap can be stacked together with application protocol like HTTP/x, [KadDHT][kaddht], [Bitswap][bitswap] or any custom protocol. ## Motivation -The current Data Availability Sampling (DAS) network protocol is inefficient. A _single_ sample operation takes log2(k) -network round-trips(where k is the square size). This is not practical and does not scale for the theoretically unlimited -data square that the Celestia network enables. The main motive here is a protocol with O(1) round-trip for _multiple_ +The current Data Availability Sampling (DAS) network protocol is inefficient. A _single_ sample operation takes log2(k) +network round-trips(where k is the square size). This is not practical and does not scale for the theoretically unlimited +data square that the Celestia network enables. The main motive here is a protocol with O(1) round-trip for _multiple_ samples, preserving the assumption of having 1/n honest peers connected. Initially, Bitswap and IPLD were adopted as the basis for the DA network protocols, including DAS, @@ -23,7 +23,7 @@ block synchronization (BS), and blob/namespace data retrieval (ND). They gave ba pluggability to rapidly scaffold Celestia's DA network. 
However, it came with the price of scalability limits and round-trips resulting in BS slower than block production. Before the network launch, the transition to the optimized [ShrEx protocol][shrex] for BS and integrating [CAR and DAGStore-based storage][storage] happened -optimizing BS and ND. However, DAS was left untouched, preserving its weak scalability and roundtrip inefficiency. +optimizing BS and ND. However, DAS was left untouched, preserving its weak scalability and roundtrip inefficiency. Shwap messaging stacked together with Bitswap protocol directly addresses described inefficiency and provides foundation for efficient communication for BS, ND, and beyond. @@ -34,16 +34,16 @@ The atomic primitive of Celestia's DA network is a share. Shwap standardize mess Shares are grouped together forming more complex data types(Rows, Blobs, etc). These data types are encapsulated in containers, e.g. Row container groups shares of a particular row. Containers can be identified with share identifiers in order to request, advertise or index the containers. The combination of containers and identifiers provides extensible -and expressive messaging framework for groups of shares and enable efficient single round-trip request-response +and expressive messaging framework for groups of shares and enable efficient single round-trip request-response communication. There are many share groups or containers known in Celestia network and systemizing this is the main reason behind setting up this simple messaging framework. There needs to be a single place with all the possible Celestia DA messages defined -which node software and protocol researchers can rely and coordinate on. Besides, this framework is designed to be -future-proof and sustain changes in the core protocol's data structures and proving system, as long shares stays the +which node software and protocol researchers can rely and coordinate on. 
Besides, this framework is designed to be +future-proof and sustain changes in the core protocol's data structures and proving system, as long shares stays the de facto atomic data type. -Besides, there needs to be systematization and common knowledge-base with all the edge cases for possible protocol +Besides, there needs to be systematization and common knowledge-base with all the edge cases for possible protocol compositions of Shwap with lower-level protocols Bitswap, KadDHT or Shrex, which Shwap aims to describe. ## Specification @@ -82,14 +82,14 @@ _**Proof**_: A Merkle inclusion proof of the data in the DataSquare. ### Message Framework -This sections defines messaging framework of Shwap. Every group of shares that needs to be exchanged over the network -MUST define its [share identifier](#share-identifiers) and [share container](#share-containers), as well as, follow +This sections defines messaging framework of Shwap. Every group of shares that needs to be exchanged over the network +MUST define its [share identifier](#share-identifiers) and [share container](#share-containers), as well as, follow their described rules. #### Share Identifiers Share identifiers defined by Shwap can be used to uniquely identify any [share container](#share-containers) over a chain -with arbitrary number of [DataSquares][square], like a range of [shares][shares], a row or a [blob][blob]. Every share +with arbitrary number of [DataSquares][square], like a range of [shares][shares], a row or a [blob][blob]. Every share identifier relates to a respective share container and vise-versa. Identifiers MUST have a fixed size for their fields. Subsequently, protobuf SHOULD NOT be used for CID serialization due @@ -100,12 +100,12 @@ Identifiers MAY embed each other to narrow down the scope of needed shares. For #### Share Containers -Share containers encapsulate a set of data shares with [DAH][dah] inclusion proof. 
Share containers are identified by +Share containers encapsulate a set of data shares with [DAH][dah] inclusion proof. Share containers are identified by [share identifiers](#share-identifiers). #### Versioning -In case defined share container or identifier requires an incompatible change the new message type MAY be introduced +In case defined share container or identifier requires an incompatible change the new message type MAY be introduced suffixed with new major version starting from v1. E.g. if Row message needs a revision, RowV1 is created. ### Messages @@ -119,7 +119,7 @@ RowID identifies the [Row shares container](#row-container) in a [DataSquare][sq RowID identifiers are formatted as shown below: -``` +```text RowID { Height: u64; RowIndex: u16; @@ -128,7 +128,7 @@ RowID { The fields with validity rules that form RowID are: -**Height**: A uint64 representing the chain height with the data square. It MUST be bigger than zero. +**Height**: A uint64 representing the chain height with the data square. It MUST be bigger than zero. **RowIndex**: An uint16 representing row index points to a particular row. It MUST not exceed the number of Row roots in [DAH][dah]. @@ -140,6 +140,7 @@ Serialized RowID MUST have length of 10 bytes. Row containers encapsulate Row of the [DataSquare][square]. Row containers are protobuf formatted using the following proto3 schema: + ```protobuf syntax = "proto3"; @@ -163,7 +164,8 @@ halves and the computed NMT root MUST be equal to the respective Row root in [DA SampleID identifies a Sample container of a single share in a [DataSquare][square]. SampleID identifiers are formatted as shown below: -``` + +```text SampleID { RowID; ColumnIndex: u16; @@ -174,7 +176,7 @@ The fields with validity rules that form SampleID are: [**RowID**](#rowid): A RowID of the sample. It MUST follow [RowID](#rowid) formatting and field validity rules. 
-**ColumnIndex**: A uint16 representing the column index of the sampled share; in other words share index in the row. It +**ColumnIndex**: A uint16 representing the column index of the sampled share; in other words share index in the row. It MUST not exceed the number of Column roots in [DAH][dah]. Serialized SampleID MUST have length of 12 bytes. @@ -184,6 +186,7 @@ Serialized SampleID MUST have length of 12 bytes. Sample containers encapsulate single shares of the [DataSquare][square]. Sample containers are protobuf formatted using the following proto3 schema: + ```protobuf syntax = "proto3"; @@ -215,11 +218,12 @@ and be verified against the respective root from Row or Column axis in [DAH][dah #### DataID -DataID identifies [namespace][ns] Data container of shares within a _single_ Row. That is, namespace shares spanning +DataID identifies [namespace][ns] Data container of shares within a _single_ Row. That is, namespace shares spanning over multiple Rows are identified with multiple identifiers. DataID identifiers are formatted as shown below: -``` + +```text DataID { RowID; Namespace; @@ -230,7 +234,7 @@ The fields with validity rules that form DataID are: [**RowID**](#rowid): A RowID of the namespace data. It MUST follow [RowID](#rowid) formatting and field validity rules. -[**Namespace**][ns]: A fixed-size bytes array representing the Namespace of interest. It MUST follow [Namespace][ns] +[**Namespace**][ns]: A fixed-size bytes array representing the Namespace of interest. It MUST follow [Namespace][ns] formatting and its validity rules. Serialized DataID MUST have length of 39 bytes. @@ -240,6 +244,7 @@ Serialized DataID MUST have length of 39 bytes. Data containers encapsulate user submitted data under [namespaces][ns]. 
Data containers are protobuf formatted using the following proto3 schema: + ```protobuf syntax = "proto3"; @@ -252,32 +257,32 @@ message Data { The fields with validity rules that form Data containers are: -[**DataID**](#dataid): A DataID of the Data container. It MUST follow [DataID](#dataid) formatting and field validity +[**DataID**](#dataid): A DataID of the Data container. It MUST follow [DataID](#dataid) formatting and field validity rules. -**DataShares**: A two-dimensional variable size byte arrays representing left data shares of a namespace in the row. +**DataShares**: A two-dimensional variable size byte arrays representing left data shares of a namespace in the row. Each share MUST follow [share formatting and validity][shares-format] rules. **Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] and be verified against the respective root from Row or Column axis in [DAH][dah]. The axis is defined by ProofType field. -Namespace data may span over multiple rows in which case all the data is encapsulated in multiple containers. This is +Namespace data may span over multiple rows in which case all the data is encapsulated in multiple containers. This is done ## Protocol Compositions -This sections specifies compositions of Shwap with other protocols. While Shwap is transport agnostic there are rough +This sections specifies compositions of Shwap with other protocols. While Shwap is transport agnostic there are rough edges on the protocol integration which every composition specifications has to describe. ### Bitswap -[Bitswap][bitswap] is an application-level protocol designed for sharing verifiable data across peer-to-peer networks. -Bitswap operates as a dynamic want-list exchange among peers in a network. Peers continuously update and share their -want-lists of desired data in real-time. If at least one connected peer has the needed data, it is promptly fetched. 
-This ongoing exchange ensures that as soon as any peer acquires the sought-after data, it can instantly share it with +[Bitswap][bitswap] is an application-level protocol designed for sharing verifiable data across peer-to-peer networks. +Bitswap operates as a dynamic want-list exchange among peers in a network. Peers continuously update and share their +want-lists of desired data in real-time. If at least one connected peer has the needed data, it is promptly fetched. +This ongoing exchange ensures that as soon as any peer acquires the sought-after data, it can instantly share it with those in need. -Shwap is designed to be synergetic with Bitswap, as that's the primary composition to be deployed in Celestia's DA +Shwap is designed to be synergetic with Bitswap, as that's the primary composition to be deployed in Celestia's DA network. Bitswap provides 1/N peers guarantee and can parallelize fetching across multiple peers. Both of these properties greatly contribute to efficient DAS protocol of Celestia. @@ -286,14 +291,14 @@ libp2p provides together with transport protocol advancements introduced in QUIC #### Multihashes and CID -Bitswap is tightly coupled with Multihash and CID notions establishing the content addressability property. Shwap takes +Bitswap is tightly coupled with Multihash and CID notions establishing the content addressability property. Shwap takes inspiration from content addressability, but breaks-free from hash-based only model to optimize message sizes and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that isn't in fact a hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of the messages happens using externally provided data commitment. -However, Bitswap still requires multihashes and CID codecs to be registered. 
Therefore, we provide a table for the -supported [share identifiers](#share-identifiers) with their respective multihash and CID codec codes. This table +However, Bitswap still requires multihashes and CID codecs to be registered. Therefore, we provide a table for the +supported [share identifiers](#share-identifiers) with their respective multihash and CID codec codes. This table is supposed to be extended whenever any new share identifier is added. | Name | Multihash | Codec | @@ -304,8 +309,8 @@ is supposed to be extended whenever any new share identifier is added. The naive question would be: "Why not to make content verification after Bitswap provided it back over its API?" Intuitively, this would simplify a lot and wouldn't require "hacking" CID. However, this has an important downside - -the Bitswap in such case would consider the request finalized and the content as fetched and valid, sending DONT_WANT -message to its peers, while the message might stillbe invalid according to the verification rules. +the Bitswap in such case would consider the request finalized and the content as fetched and valid, sending DONT_WANT +message to its peers, while the message might still be invalid according to the verification rules. 
## Backwards Compatibility @@ -341,6 +346,7 @@ for consistency reason, even though we could choose other more efficient and adv [shrex]: https://github.com/celestiaorg/celestia-node/blob/0abd16bbb05bf3016595498844a588ef55c63d2d/docs/adr/adr-013-blocksync-overhaul-part-2.md [storage]: https://github.com/celestiaorg/celestia-node/blob/a33c80e20da684d656c7213580be7878bcd27cf4/docs/adr/adr-011-blocksync-overhaul-part-1.md [bitswap]: https://docs.ipfs.tech/concepts/bitswap/ +[kaddht]: https://pdos.csail.mit.edu/~petar/papers/maymounkov-kademlia-lncs.pdf [square]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#2d-reed-solomon-encoding-scheme [shares]: https://celestiaorg.github.io/celestia-app/specs/shares.html#abstract [shares-format]: https://celestiaorg.github.io/celestia-app/specs/shares.html#share-format @@ -349,4 +355,6 @@ for consistency reason, even though we could choose other more efficient and adv [ns]: https://celestiaorg.github.io/celestia-app/specs/namespace.html#abstract [nmt]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md [nmt-pb]: https://github.com/celestiaorg/nmt/blob/f5556676429118db8eeb5fc396a2c75ab12b5f20/pb/proof.proto -[nmt-verify]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md#namespace-proof-verification \ No newline at end of file +[nmt-verify]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md#namespace-proof-verification +[gimpl]: https://github.com/celestiaorg/celestia-node/pull/2675 +[rimpl]: https://github.com/eigerco/lumina/blob/561640072114fa5c4ed807e94882473476a41dda/node/src/p2p/shwap.rs From 0644ea7b4462e6eeac287be0c7819827ed6a81e7 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 17:00:57 +0100 Subject: [PATCH 05/17] update bitswap section --- specs/src/shwap/spec.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md index b41cff1b73..f3f8293152 100644 --- 
a/specs/src/shwap/spec.md +++ b/specs/src/shwap/spec.md @@ -291,11 +291,20 @@ libp2p provides together with transport protocol advancements introduced in QUIC #### Multihashes and CID -Bitswap is tightly coupled with Multihash and CID notions establishing the content addressability property. Shwap takes -inspiration from content addressability, but breaks-free from hash-based only model to optimize message sizes -and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that isn't in fact a -hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of the messages -happens using externally provided data commitment. +Bitswap is tightly coupled with Multihash and CID notions establishing the [content addressability property][content-address]. +Bitswap operates over Blocks of data which are addressed and verified by CIDs. Basing on that, Shwap integrates into +Bitswap by complying to both of these interfaces. The [Share Containers](#share-containers) are Blocks which are identified +via [Share Identifiers](#share-identifiers). + +Even though Shwap takes inspiration from content addressability, it breaks-free from hash-based only model to optimize +message sizes and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that +isn't in fact a hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of +the messages happens using externally provided data commitment. + +The naive question would be: "Why not to make content verification after Bitswap provided it back over its API?" +Intuitively, this would simplify a lot and wouldn't require "hacking" CID.
However, this has an important downside - +the Bitswap in such case would consider the request finalized and the content as fetched and valid, sending DONT_WANT +message to its peers, while the message might still be invalid according to the verification rules. However, Bitswap still requires multihashes and CID codecs to be registered. Therefore, we provide a table for the supported [share identifiers](#share-identifiers) with their respective multihash and CID codec codes. This table @@ -307,11 +316,6 @@ is supposed to be extended whenever any new share identifier is added. | SampleID | 0x7801 | 0x7800 | | DataID | 0x7821 | 0x7820 | -The naive question would be: "Why not to make content verification after Bitswap provided it back over its API?" -Intuitively, this would simplify a lot and wouldn't require "hacking" CID. However, this has an important downside - -the Bitswap in such case would consider the request finalized and the content as fetched and valid, sending DONT_WANT -message to its peers, while the message might still be invalid according to the verification rules. - ## Backwards Compatibility Swap is incompatible with the old sampling protocol. 
@@ -346,6 +350,7 @@ for consistency reason, even though we could choose other more efficient and adv [shrex]: https://github.com/celestiaorg/celestia-node/blob/0abd16bbb05bf3016595498844a588ef55c63d2d/docs/adr/adr-013-blocksync-overhaul-part-2.md [storage]: https://github.com/celestiaorg/celestia-node/blob/a33c80e20da684d656c7213580be7878bcd27cf4/docs/adr/adr-011-blocksync-overhaul-part-1.md [bitswap]: https://docs.ipfs.tech/concepts/bitswap/ +[content-address]: https://fission.codes/blog/content-addressing-what-it-is-and-how-it-works/ [kaddht]: https://pdos.csail.mit.edu/~petar/papers/maymounkov-kademlia-lncs.pdf [square]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#2d-reed-solomon-encoding-scheme [shares]: https://celestiaorg.github.io/celestia-app/specs/shares.html#abstract From dc3725e24d2c50e460e197e62568e40aa07e1721 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 17:58:15 +0100 Subject: [PATCH 06/17] update bitswap section --- specs/src/shwap/spec.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md index f3f8293152..19b9bc13ca 100644 --- a/specs/src/shwap/spec.md +++ b/specs/src/shwap/spec.md @@ -301,10 +301,12 @@ message sizes and data request patterns. In some way, it hacks into multihash ab isn't in fact a hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of the messages happens using externally provided data commitment. -The naive question would be: "Why not to make content verification after Bitswap provided it back over its API?" -Intuitively, this would simplify a lot and wouldn't require "hacking" CID. However, this has an important downside - -the Bitswap in such case would consider the request finalized and the content as fetched and valid, sending DONT_WANT -message to its peers, while the message might still be invalid according to the verification rules. 
+This in turn creates a bunch of complexities with the [reference Golang implementation][gimpl] that are necessary if forking +and diverging the upstream substantially is not an option. The naive question would be: "Why not to make content +verification after Bitswap provided it back over its API?" Intuitively, this would simplify a lot and wouldn't require +"hacking" CID. However, this has an important downside - the Bitswap in such case would consider the request finalized +and the content as fetched and valid, sending DONT_WANT message to its peers, while the message might still be invalid +according to the verification rules. However, Bitswap still requires multihashes and CID codecs to be registered. Therefore, we provide a table for the supported [share identifiers](#share-identifiers) with their respective multihash and CID codec codes. This table From 03ae4b8bdb767c44f186ff740c4f08301b9f0cf4 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 18:36:10 +0100 Subject: [PATCH 07/17] grammar fixes --- specs/src/shwap/spec.md | 130 +++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md index 19b9bc13ca..6b2e93c3e5 100644 --- a/specs/src/shwap/spec.md +++ b/specs/src/shwap/spec.md @@ -2,49 +2,49 @@ ## Abstract -This document specifies Shwap - the simple and expressive, yet extensible and future-proof messaging framework aiming to -solve critical inefficiencies and standardise messaging of Celestia's Data Availability p2p network. +This document specifies Shwap - the simple and expressive yet extensible and future-proof messaging framework aiming to +solve critical inefficiencies and standardize messaging of Celestia's Data Availability p2p network. -Shwap defines messaging framework to be exchanged around the DA p2p network in trust-minimized way and without enforcing -transport(QUIC/TCP or IP) or application layer protocol semantics(e.g HTTP/x).
Using this framework, Shwap -declares the most common messages and provides options on how to stack them with lower-level protocols. +Shwap defines a messaging framework to be exchanged around the DA p2p network in a trust-minimized way without enforcing +transport(QUIC/TCP or IP) or application layer protocol semantics(e.g., HTTP/x). Using this framework, Shwap +declares the most common messages and provides options for stacking them with lower-level protocols. Shwap can be stacked together with application protocol like HTTP/x, [KadDHT][kaddht], [Bitswap][bitswap] or any custom protocol. ## Motivation The current Data Availability Sampling (DAS) network protocol is inefficient. A _single_ sample operation takes log2(k) -network round-trips(where k is the square size). This is not practical and does not scale for the theoretically unlimited -data square that the Celestia network enables. The main motive here is a protocol with O(1) round-trip for _multiple_ +network roundtrips (where k is the square size). This is not practical and does not scale for the theoretically unlimited +data square that the Celestia network enables. The main motive here is a protocol with O(1) roundtrip for _multiple_ samples, preserving the assumption of having 1/n honest peers connected. Initially, Bitswap and IPLD were adopted as the basis for the DA network protocols, including DAS, block synchronization (BS), and blob/namespace data retrieval (ND). They gave battle-tested protocols and tooling with pluggability to rapidly scaffold Celestia's DA network. However, it came with the price of scalability limits and -round-trips resulting in BS slower than block production. Before the network launch, the transition +roundtrips, resulting in slower BS than block production. Before the network launch, the transition to the optimized [ShrEx protocol][shrex] for BS and integrating [CAR and DAGStore-based storage][storage] happened optimizing BS and ND. 
However, DAS was left untouched, preserving its weak scalability and roundtrip inefficiency. -Shwap messaging stacked together with Bitswap protocol directly addresses described inefficiency and provides foundation +Shwap messaging stacked together with Bitswap protocol directly addresses described inefficiency and provides a foundation for efficient communication for BS, ND, and beyond. ## Rationale -The atomic primitive of Celestia's DA network is a share. Shwap standardize messaging and serialization for shares. -Shares are grouped together forming more complex data types(Rows, Blobs, etc). These data types are encapsulated in -containers, e.g. Row container groups shares of a particular row. Containers can be identified with share identifiers -in order to request, advertise or index the containers. The combination of containers and identifiers provides extensible -and expressive messaging framework for groups of shares and enable efficient single round-trip request-response +The atomic primitive of Celestia's DA network is the share. Shwap standardizes messaging and serialization for shares. +Shares are grouped together, forming more complex data types(Rows, Blobs, etc.). These data types are encapsulated in +containers, e.g., Row container groups shares of a particular row. Containers can be identified with the share identifiers +in order to request, advertise or index the containers. The combination of containers and identifiers provides an extensible +and expressive messaging framework for groups of shares and enables efficient single roundtrip request-response communication. -There are many share groups or containers known in Celestia network and systemizing this is the main reason behind setting -up this simple messaging framework. There needs to be a single place with all the possible Celestia DA messages defined -which node software and protocol researchers can rely and coordinate on. 
Besides, this framework is designed to be -future-proof and sustain changes in the core protocol's data structures and proving system, as long shares stays the +Many share groups or containers are known in the Celestia network, and systemizing this is the main reason behind setting +up this simple messaging framework. A single place with all the possible Celestia DA messages must be defined, which node +software and protocol researchers can rely on and coordinate. Besides, this framework is designed to be +future-proof and sustain changes in the core protocol's data structures and proving system as long as shares stay the de facto atomic data type. -Besides, there needs to be systematization and common knowledge-base with all the edge cases for possible protocol -compositions of Shwap with lower-level protocols Bitswap, KadDHT or Shrex, which Shwap aims to describe. +Besides, there needs to be systematization and a joint knowledge base with all the edge cases for possible protocol +compositions of Shwap with lower-level protocols Bitswap, KadDHT, or Shrex, which Shwap aims to describe. ## Specification @@ -70,7 +70,7 @@ _**[DAH][dah]**_: The Data Availability Header with Row and Column commitments. _**[Namespace][ns]**_: The namespace grouping sets of shares. _**Peer**_: An entity that can participate in a Shwap protocol. There are three types of peers: -client, server and node. +client, server, and node. _**Client**_: The Peer that requests content by content identifies over Shwap. @@ -82,16 +82,12 @@ _**Proof**_: A Merkle inclusion proof of the data in the DataSquare. ### Message Framework -This sections defines messaging framework of Shwap. Every group of shares that needs to be exchanged over the network +This section defines Shwap's messaging framework. Every group of shares that needs to be exchanged over the network MUST define its [share identifier](#share-identifiers) and [share container](#share-containers), as well as, follow their described rules.
#### Share Identifiers -Share identifiers defined by Shwap can be used to uniquely identify any [share container](#share-containers) over a chain -with arbitrary number of [DataSquares][square], like a range of [shares][shares], a row or a [blob][blob]. Every share -identifier relates to a respective share container and vise-versa. - Identifiers MUST have a fixed size for their fields. Subsequently, protobuf SHOULD NOT be used for CID serialization due to varints and lack of fixed size arrays. Instead, identifiers use simple binary big endian serialization. @@ -105,13 +101,13 @@ Share containers encapsulate a set of data shares with [DAH][dah] inclusion proo #### Versioning -In case defined share container or identifier requires an incompatible change the new message type MAY be introduced -suffixed with new major version starting from v1. E.g. if Row message needs a revision, RowV1 is created. +If a defined share container or identifier requires an incompatible change, the new message type MAY be introduced +suffixed with a new major version starting from v1. E.g., if the Row message needs a revision, RowV1 is created. ### Messages -This section defines all the supported Shwap messages which includes share identifiers and share containers. All the new -future messages should be described in here. +This section defines all the supported Shwap messages, including share identifiers and containers. All the new +future messages should be described in this section. #### RowID @@ -133,12 +129,10 @@ The fields with validity rules that form RowID are: **RowIndex**: An uint16 representing row index points to a particular row. It MUST not exceed the number of Row roots in [DAH][dah]. -Serialized RowID MUST have length of 10 bytes. +Serialized RowID MUST have a length of 10 bytes. #### Row Container -Row containers encapsulate Row of the [DataSquare][square].
- Row containers are protobuf formatted using the following proto3 schema: ```protobuf @@ -154,8 +148,7 @@ The fields with validity rules that form Row containers are: [**RowID**](#rowid): A RowID of the Row Container. It MUST follow [RowID](#rowid) formatting and field validity rules. -**RowHalf**: A two-dimensional variable size byte arrays representing left half of shares in the row. It MUST be equal -to the number of Columns roots in [DAH][dah] divided by two. These shares MUST only be from the left half of the row. +**RowHalf**: A two-dimensional variable size byte arrays representing left half of shares in the row. It MUST equal the number of Columns roots in [DAH][dah] divided by two. These shares MUST only be from the left half of the row. The right half is computed using Leopard GF16 Reed-Solomon erasure-coding. Afterward, the [NMT][nmt] is built over both halves and the computed NMT root MUST be equal to the respective Row root in [DAH][dah]. @@ -176,10 +169,10 @@ The fields with validity rules that form SampleID are: [**RowID**](#rowid): A RowID of the sample. It MUST follow [RowID](#rowid) formatting and field validity rules. -**ColumnIndex**: A uint16 representing the column index of the sampled share; in other words share index in the row. It -MUST not exceed the number of Column roots in [DAH][dah]. +**ColumnIndex**: A uint16 representing the column index of the sampled share; in other words, the share index in the row. It +MUST stay within the number of Column roots in [DAH][dah]. -Serialized SampleID MUST have length of 12 bytes. +Serialized SampleID MUST have a length of 12 bytes. #### Sample Container @@ -190,7 +183,7 @@ Sample containers are protobuf formatted using the following proto3 schema: ```protobuf syntax = "proto3"; -message Sample { +message Sample { bytes sample_id = 1; bytes sample_share = 2; Proof sample_proof = 3; @@ -212,9 +205,9 @@ validity rules. formatting and validity][shares-format] rules.
**Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] -and be verified against the respective root from Row or Column axis in [DAH][dah]. The axis is defined by ProofType field. +and be verified against the respective root from the Row or Column axis in [DAH][dah]. The axis is defined by the ProofType field. -**ProofType**: An enum defining which root the Proof is coming from. It MUST be either RowProofType or ColumnProofType. +**ProofType**: An enum defining which axis root the Proof is coming from. It MUST be either RowProofType or ColumnProofType. #### DataID @@ -237,11 +230,11 @@ The fields with validity rules that form DataID are: [**Namespace**][ns]: A fixed-size bytes array representing the Namespace of interest. It MUST follow [Namespace][ns] formatting and its validity rules. -Serialized DataID MUST have length of 39 bytes. +Serialized DataID MUST have a length of 39 bytes. #### Data Container -Data containers encapsulate user submitted data under [namespaces][ns]. +Data containers encapsulate user-submitted data under [namespaces][ns]. Data containers are protobuf formatted using the following proto3 schema: @@ -264,53 +257,53 @@ rules. Each share MUST follow [share formatting and validity][shares-format] rules. **Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] -and be verified against the respective root from Row or Column axis in [DAH][dah]. The axis is defined by ProofType field. +and be verified against the respective root from the Row or Column axis in [DAH][dah]. The axis is defined by the ProofType field. -Namespace data may span over multiple rows in which case all the data is encapsulated in multiple containers. This is +Namespace data may span over multiple rows, in which case all the data is encapsulated in multiple containers. 
This is done ## Protocol Compositions -This sections specifies compositions of Shwap with other protocols. While Shwap is transport agnostic there are rough -edges on the protocol integration which every composition specifications has to describe. +This section specifies compositions of Shwap with other protocols. While Shwap is transport agnostic, there are rough +edges on the protocol integration, which every composition specification has to describe. ### Bitswap -[Bitswap][bitswap] is an application-level protocol designed for sharing verifiable data across peer-to-peer networks. +[Bitswap][bitswap] is an application-level protocol for sharing verifiable data across peer-to-peer networks. Bitswap operates as a dynamic want-list exchange among peers in a network. Peers continuously update and share their -want-lists of desired data in real-time. If at least one connected peer has the needed data, it is promptly fetched. +want lists of desired data in real time. It is promptly fetched if at least one connected peer has the needed data. This ongoing exchange ensures that as soon as any peer acquires the sought-after data, it can instantly share it with those in need. -Shwap is designed to be synergetic with Bitswap, as that's the primary composition to be deployed in Celestia's DA -network. Bitswap provides 1/N peers guarantee and can parallelize fetching across multiple peers. Both of these properties -greatly contribute to efficient DAS protocol of Celestia. +Shwap is designed to be synergetic with Bitswap, as that is the primary composition to be deployed in Celestia's DA +network. Bitswap provides the 1/N peers guarantee and can parallelize fetching across multiple peers. Both of these properties +significantly contribute to Celestia's efficient DAS protocol. -Bitswap runs over libp2p stack which provides QUIC transport integration. Subsequently, Shwap will benefit from features +Bitswap runs over the libp2p stack, which provides QUIC transport integration. 
Subsequently, Shwap will benefit from features libp2p provides together with transport protocol advancements introduced in QUIC. #### Multihashes and CID -Bitswap is tightly coupled with Multihash and CID notions establishing the [content addressability property][content-address]. -Bitswap operates over Blocks of data which are addressed and verified by CIDs. Basing on that, Shwap integrates into -Bitswap by complying to both of these interfaces. The [Share Containers](#share-containers) are Blocks which identified +Bitswap is tightly coupled with Multihash and CID notions, establishing the [content addressability property][content-address]. +Bitswap operates over Blocks of data that are addressed and verified by CIDs. Based on that, Shwap integrates into +Bitswap by complying with both of these interfaces. The [Share Containers](#share-containers) are Blocks that are identified via [Share Identifiers](#share-identifiers). -Even though Shwap takes inspiration from content addressability, it breaks-free from hash-based only model to optimize +Even though Shwap takes inspiration from content addressability, it breaks free from the hash-based model to optimize message sizes and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that -isn't in fact a hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of +is not, in fact, a hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of the messages happens using externally provided data commitment. -This in turn creates a bunch complexities with the [reference Golang implementation][gimpl] that are necessary if forking -and diverging the upstream substantially is not an option. The naive question would be: "Why not to make content -verification after Bitswap provided it back over its API?" Intuitively, this would simplify a lot and wouldn't require -"hacking" CID. 
However, this has an important downside - the Bitswap in such case would consider the request finalized -and the content as fetched and valid, sending DONT_WANT message to its peers, while the message might still be invalid +This creates a bunch of complexities with the [reference Golang implementation][gimpl] that are necessary if forking +and substantially diverging the upstream is not an option. The naive question would be: "Why not make content +verification after Bitswap provided it back over its API?" Intuitively, this would simplify much and would not require +"hacking" CID. However, this has an important downside - the Bitswap, in such a case, would consider the request finalized +and the content as fetched and valid, sending a DONT_WANT message to its peers. In contrast, the message might still be invalid according to the verification rules. However, Bitswap still requires multihashes and CID codecs to be registered. Therefore, we provide a table for the supported [share identifiers](#share-identifiers) with their respective multihash and CID codec codes. This table -is supposed to be extended whenever any new share identifier is added. +should be extended whenever any new share identifier is added. | Name | Multihash | Codec | |----------|-----------|--------| @@ -322,7 +315,7 @@ is supposed to be extended whenever any new share identifier is added. Swap is incompatible with the old sampling protocol. -After rigorous investigation, celestia-node team decided against _implementing_ backward compatibility with +After rigorous investigation, the celestia-node team decided against _implementing_ backward compatibility with the old protocol into the node client due to the immense complications it brings. Instead, the simple and time-efficient strategy is transiently deploying infrastructure for old and new versions, allowing network participants to migrate gradually to the latest version. 
We will first deprecate the old version, and once the majority has migrated, we will @@ -337,12 +330,12 @@ protocol for data retrieval. Essentially, the network and its codebase get simplified and require less code and infrastructure to operate. This in turn decreases the amount of implementation vulnerabilities, DOS vectors, message amplification, and resource exhaustion attacks. -Although, new bug may be introduced as with any new protocol. +However, new bugs may be introduced, as with any new protocol. ### Protobuf Serialization -Protobuf is widely adopted serialization format and is used within Celestia's protocols. This was quite an obvious choice -for consistency reason, even though we could choose other more efficient and advanced formats like Cap'n Proto. +Protobuf is a widely adopted serialization format and is used within Celestia's protocols. This was quite an obvious choice +for consistency reasons, even though we could choose other more efficient and advanced formats like Cap'n Proto. 
## Reference Implementation @@ -357,7 +350,6 @@ for consistency reason, even though we could choose other more efficient and adv [square]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#2d-reed-solomon-encoding-scheme [shares]: https://celestiaorg.github.io/celestia-app/specs/shares.html#abstract [shares-format]: https://celestiaorg.github.io/celestia-app/specs/shares.html#share-format -[blob]: https://celestiaorg.github.io/celestia-app/specs/data_square_layout.html#blob-share-commitment-rules [dah]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#availabledataheader [ns]: https://celestiaorg.github.io/celestia-app/specs/namespace.html#abstract [nmt]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md From 1a781462daabacd18edcbdb122b2c802f5c61094 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 18:46:59 +0100 Subject: [PATCH 08/17] migrate spec to CIP --- specs/src/SUMMARY.md | 2 +- specs/src/shwap/spec.md | 359 ---------------------------------------- 2 files changed, 1 insertion(+), 360 deletions(-) delete mode 100644 specs/src/shwap/spec.md diff --git a/specs/src/SUMMARY.md b/specs/src/SUMMARY.md index c5e5e92fa9..dd67b8e4dd 100644 --- a/specs/src/SUMMARY.md +++ b/specs/src/SUMMARY.md @@ -1,3 +1,3 @@ # Summary -- [Shwap](./shwap/spec.md) +- [Shwap](https://github.com/celestiaorg/CIPs/pull/77#issuecomment-1977130416) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md deleted file mode 100644 index 6b2e93c3e5..0000000000 --- a/specs/src/shwap/spec.md +++ /dev/null @@ -1,359 +0,0 @@ -# Shwap Protocol Specification - -## Abstract - -This document specifies Shwap - the simple and expressive yet extensible and future-proof messaging framework aiming to -solve critical inefficiencies and standardize messaging of Celestia's Data Availability p2p network. 
- -Shwap defines a messaging framework to be exchanged around the DA p2p network in a trust-minimized way without enforcing -transport(QUIC/TCP or IP) or application layer protocol semantics(e.g., HTTP/x). Using this framework, Shwap -declares the most common messages and provides options for stacking them with lower-level protocols. -Shwap can be stacked together with application protocol like HTTP/x, [KadDHT][kaddht], [Bitswap][bitswap] or any custom -protocol. - -## Motivation - -The current Data Availability Sampling (DAS) network protocol is inefficient. A _single_ sample operation takes log2(k) -network roundtrips (where k is the square size). This is not practical and does not scale for the theoretically unlimited -data square that the Celestia network enables. The main motive here is a protocol with O(1) roundtrip for _multiple_ -samples, preserving the assumption of having 1/n honest peers connected. - -Initially, Bitswap and IPLD were adopted as the basis for the DA network protocols, including DAS, -block synchronization (BS), and blob/namespace data retrieval (ND). They gave battle-tested protocols and tooling with -pluggability to rapidly scaffold Celestia's DA network. However, it came with the price of scalability limits and -roundtrips, resulting in slower BS than block production. Before the network launch, the transition -to the optimized [ShrEx protocol][shrex] for BS and integrating [CAR and DAGStore-based storage][storage] happened -optimizing BS and ND. However, DAS was left untouched, preserving its weak scalability and roundtrip inefficiency. - -Shwap messaging stacked together with Bitswap protocol directly addresses described inefficiency and provides a foundation -for efficient communication for BS, ND, and beyond. - -## Rationale - -The atomic primitive of Celestia's DA network is the share. Shwap standardizes messaging and serialization for shares. -Shares are grouped together, forming more complex data types(Rows, Blobs, etc.). 
These data types are encapsulated in -containers, e.g., Row container groups shares of a particular row. Containers can be identified with the share identifiers -in order to request, advertise or index the containers. The combination of containers and identifiers provides an extensible -and expressive messaging framework for groups of shares and enables efficient single roundtrip request-response -communication. - -Many share groups or containers are known in the Celestia network, and systemizing this is the main reason behind setting -up this simple messaging framework. A single place with all the possible Celestia DA messages must be defined, which node -software and protocol researchers can rely on and coordinate. Besides, this framework is designed to be -future-proof and sustain changes in the core protocol's data structures and proving system as long shares stay the -de facto atomic data type. - -Besides, there needs to be systematization and a joint knowledge base with all the edge cases for possible protocol -compositions of Shwap with lower-level protocols Bitswap, KadDHT, or Shrex, which Shwap aims to describe. - -## Specification - -### Terms and Definitions - -The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", -"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and -"OPTIONAL" in this document are to be interpreted as described in BCP -14 [RFC2119] [RFC8174] when, and only when, they appear in all -capitals, as shown here. - -Commonly used terms in this document are described below. - -_**Shwap**_: The protocol described by this document. Shwap is a -portmanteau name of words share and swap. - -_**[Share][shares]**_: The core data structure of DataSquare **"swapped"** between peers. - -_**[DataSquare][square]**_: The DA square format used by Celestia DA network. - -_**[DAH][dah]**_: The Data Availability Header with Row and Column commitments. - -_**[Namespace][ns]**_: The namespace grouping sets of shares. 
- -_**Peer**_: An entity that can participate in a Shwap protocol. There are three types of peers: -client, server, and node. - -_**Client**_: The Peer that requests content by content identifies over Shwap. - -_**Server**_: The Peer that responds with content over Shwap. - -_**Node**_: The peer that is both the client and the server. - -_**Proof**_: A Merkle inclusion proof of the data in the DataSquare. - -### Message Framework - -This section defines Shwap's messaging framework. Every group of shares that needs to be exchanged over the network -MUST define its [share identifier](#share-identifiers) and [share container](#share-containers), as well as, follow -their described rules. - -#### Share Identifiers - -Identifiers MUST have a fixed size for their fields. Subsequently, protobuf SHOULD NOT be used for CID serialization due -to varints and lack of fixed size arrays. Instead, identifiers use simple binary big endian serialization. - -Identifiers MAY embed each other to narrow down the scope of needed shares. For example, [SampleID](#sampleid) embeds -[RowID](#rowid) as every sample lay on a particular row. - -#### Share Containers - -Share containers encapsulate a set of data shares with [DAH][dah] inclusion proof. Share containers are identified by -[share identifiers](#share-identifiers). - -#### Versioning - -If a defined share container or identifier requires an incompatible change, the new message type MAY be introduced -suffixed with a new major version starting from v1. E.g., if the Row message needs a revision, RowV1 is created. - -### Messages - -This section defines all the supported Shwap messages, including share identifiers and containers. All the new -future messages should be described in the section. - -#### RowID - -RowID identifies the [Row shares container](#row-container) in a [DataSquare][square]. 
- -RowID identifiers are formatted as shown below: - -```text -RowID { - Height: u64; - RowIndex: u16; -} -``` - -The fields with validity rules that form RowID are: - -**Height**: A uint64 representing the chain height with the data square. It MUST be bigger than zero. - -**RowIndex**: An uint16 representing row index points to a particular row. It MUST not exceed the number of Row roots in -[DAH][dah]. - -Serialized RowID MUST have a length of 10 bytes. - -#### Row Container - -Row containers are protobuf formatted using the following proto3 schema: - -```protobuf -syntax = "proto3"; - -message Row { - bytes row_id = 1; - repeated bytes row_half = 2; -} -``` - -The fields with validity rules that form Row containers are: - -[**RowID**](#rowid): A RowID of the Row Container. It MUST follow [RowID](#rowid) formatting and field validity rules. - -**RowHalf**: A two-dimensional variable size byte arrays representing left half of shares in the row. It MUST equal the number of Columns roots in [DAH][dah] divided by two. These shares MUST only be from the left half of the row. -The right half is computed using Leopard GF16 Reed-Solomon erasure-coding. Afterward, the [NMT][nmt] is built over both -halves and the computed NMT root MUST be equal to the respective Row root in [DAH][dah]. - -#### SampleID - -SampleID identifies a Sample container of a single share in a [DataSquare][square]. - -SampleID identifiers are formatted as shown below: - -```text -SampleID { - RowID; - ColumnIndex: u16; -} -``` - -The fields with validity rules that form SampleID are: - -[**RowID**](#rowid): A RowID of the sample. It MUST follow [RowID](#rowid) formatting and field validity rules. - -**ColumnIndex**: A uint16 representing the column index of the sampled share; in other words, the share index in the row. It -MUST stay within the number of Column roots in [DAH][dah]. - -Serialized SampleID MUST have a length of 12 bytes. 
- -#### Sample Container - -Sample containers encapsulate single shares of the [DataSquare][square]. - -Sample containers are protobuf formatted using the following proto3 schema: - -```protobuf -syntax = "proto3"; - -message sample { - bytes sample_id = 1; - bytes sample_share = 2; - Proof sample_proof = 3; - ProofType proof_type = 4; -} - -enum ProofType { - RowProofType = 0; - ColProofType = 1; -} -``` - -The fields with validity rules that form Sample containers are: - -[**SampleID**](#sampleid): A SampleID of the Sample container. It MUST follow [SampleID](#sampleid) formatting and field -validity rules. - -**SampleShare**: A variable size array representing the share contained in the sample. Each share MUST follow [share -formatting and validity][shares-format] rules. - -**Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] -and be verified against the respective root from the Row or Column axis in [DAH][dah]. The axis is defined by the ProofType field. - -**ProofType**: An enum defining which axis root the Proof is coming from. It MUST be either RowProofType or ColumnProofType. - -#### DataID - -DataID identifies [namespace][ns] Data container of shares within a _single_ Row. That is, namespace shares spanning -over multiple Rows are identified with multiple identifiers. - -DataID identifiers are formatted as shown below: - -```text -DataID { - RowID; - Namespace; -} -``` - -The fields with validity rules that form DataID are: - -[**RowID**](#rowid): A RowID of the namespace data. It MUST follow [RowID](#rowid) formatting and field validity rules. - -[**Namespace**][ns]: A fixed-size bytes array representing the Namespace of interest. It MUST follow [Namespace][ns] -formatting and its validity rules. - -Serialized DataID MUST have a length of 39 bytes. - -#### Data Container - -Data containers encapsulate user-submitted data under [namespaces][ns]. 
- -Data containers are protobuf formatted using the following proto3 schema: - -```protobuf -syntax = "proto3"; - -message Data { - bytes data_id = 1; - repeated bytes data_shares = 2; - Proof data_proof = 3; -} -``` - -The fields with validity rules that form Data containers are: - -[**DataID**](#dataid): A DataID of the Data container. It MUST follow [DataID](#dataid) formatting and field validity -rules. - -**DataShares**: A two-dimensional variable size byte arrays representing left data shares of a namespace in the row. -Each share MUST follow [share formatting and validity][shares-format] rules. - -**Proof**: A [protobuf formated][nmt-pb] [NMT][nmt] proof of share inclusion. It MUST follow [NMT proof verification][nmt-verify] -and be verified against the respective root from the Row or Column axis in [DAH][dah]. The axis is defined by the ProofType field. - -Namespace data may span over multiple rows, in which case all the data is encapsulated in multiple containers. This is -done - -## Protocol Compositions - -This section specifies compositions of Shwap with other protocols. While Shwap is transport agnostic, there are rough -edges on the protocol integration, which every composition specification has to describe. - -### Bitswap - -[Bitswap][bitswap] is an application-level protocol for sharing verifiable data across peer-to-peer networks. -Bitswap operates as a dynamic want-list exchange among peers in a network. Peers continuously update and share their -want lists of desired data in real time. It is promptly fetched if at least one connected peer has the needed data. -This ongoing exchange ensures that as soon as any peer acquires the sought-after data, it can instantly share it with -those in need. - -Shwap is designed to be synergetic with Bitswap, as that is the primary composition to be deployed in Celestia's DA -network. Bitswap provides the 1/N peers guarantee and can parallelize fetching across multiple peers. 
Both of these properties -significantly contribute to Celestia's efficient DAS protocol. - -Bitswap runs over the libp2p stack, which provides QUIC transport integration. Subsequently, Shwap will benefit from features -libp2p provides together with transport protocol advancements introduced in QUIC. - -#### Multihashes and CID - -Bitswap is tightly coupled with Multihash and CID notions, establishing the [content addressability property][content-address]. -Bitswap operates over Blocks of data that are addressed and verified by CIDs. Based on that, Shwap integrates into -Bitswap by complying with both of these interfaces. The [Share Containers](#share-containers) are Blocks that are identified -via [Share Identifiers](#share-identifiers). - -Even though Shwap takes inspiration from content addressability, it breaks free from the hash-based model to optimize -message sizes and data request patterns. In some way, it hacks into multihash abstraction to make it contain data that -is not, in fact, a hash. Furthermore, the protocol does not include hash digests in the multihashes. The authentication of -the messages happens using externally provided data commitment. - -This creates a bunch of complexities with the [reference Golang implementation][gimpl] that are necessary if forking -and substantially diverging the upstream is not an option. The naive question would be: "Why not make content -verification after Bitswap provided it back over its API?" Intuitively, this would simplify much and would not require -"hacking" CID. However, this has an important downside - the Bitswap, in such a case, would consider the request finalized -and the content as fetched and valid, sending a DONT_WANT message to its peers. In contrast, the message might still be invalid -according to the verification rules. - -However, Bitswap still requires multihashes and CID codecs to be registered. 
Therefore, we provide a table for the -supported [share identifiers](#share-identifiers) with their respective multihash and CID codec codes. This table -should be extended whenever any new share identifier is added. - -| Name | Multihash | Codec | -|----------|-----------|--------| -| RowID | 0x7811 | 0x7810 | -| SampleID | 0x7801 | 0x7800 | -| DataID | 0x7821 | 0x7820 | - -## Backwards Compatibility - -Swap is incompatible with the old sampling protocol. - -After rigorous investigation, the celestia-node team decided against _implementing_ backward compatibility with -the old protocol into the node client due to the immense complications it brings. Instead, the simple and time-efficient -strategy is transiently deploying infrastructure for old and new versions, allowing network participants to migrate -gradually to the latest version. We will first deprecate the old version, and once the majority has migrated, we will -terminate the old infrastructure. - -## Considerations - -### Security - -Shwap does not change the security model of Celestia's Data Availability network and changes the underlying -protocol for data retrieval. - -Essentially, the network and its codebase get simplified and require less code and infrastructure to operate. This in turn -decreases the amount of implementation vulnerabilities, DOS vectors, message amplification, and resource exhaustion attacks. -However, new bugs may be introduced, as with any new protocol. - -### Protobuf Serialization - -Protobuf is a widely adopted serialization format and is used within Celestia's protocols. This was quite an obvious choice -for consistency reasons, even though we could choose other more efficient and advanced formats like Cap'n Proto. 
- -## Reference Implementation - -- [Go reference implementation with Bitswap composition][gimpl] -- [Rust implementation with Bitswap composition][rimpl] - -[shrex]: https://github.com/celestiaorg/celestia-node/blob/0abd16bbb05bf3016595498844a588ef55c63d2d/docs/adr/adr-013-blocksync-overhaul-part-2.md -[storage]: https://github.com/celestiaorg/celestia-node/blob/a33c80e20da684d656c7213580be7878bcd27cf4/docs/adr/adr-011-blocksync-overhaul-part-1.md -[bitswap]: https://docs.ipfs.tech/concepts/bitswap/ -[content-address]: https://fission.codes/blog/content-addressing-what-it-is-and-how-it-works/ -[kaddht]: https://pdos.csail.mit.edu/~petar/papers/maymounkov-kademlia-lncs.pdf -[square]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#2d-reed-solomon-encoding-scheme -[shares]: https://celestiaorg.github.io/celestia-app/specs/shares.html#abstract -[shares-format]: https://celestiaorg.github.io/celestia-app/specs/shares.html#share-format -[dah]: https://celestiaorg.github.io/celestia-app/specs/data_structures.html#availabledataheader -[ns]: https://celestiaorg.github.io/celestia-app/specs/namespace.html#abstract -[nmt]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md -[nmt-pb]: https://github.com/celestiaorg/nmt/blob/f5556676429118db8eeb5fc396a2c75ab12b5f20/pb/proof.proto -[nmt-verify]: https://github.com/celestiaorg/nmt/blob/master/docs/spec/nmt.md#namespace-proof-verification -[gimpl]: https://github.com/celestiaorg/celestia-node/pull/2675 -[rimpl]: https://github.com/eigerco/lumina/blob/561640072114fa5c4ed807e94882473476a41dda/node/src/p2p/shwap.rs From 15d620cef73076224c106a608db70c2e807b7abf Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 18:48:42 +0100 Subject: [PATCH 09/17] migrate spec to CIP --- specs/src/SUMMARY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specs/src/SUMMARY.md b/specs/src/SUMMARY.md index dd67b8e4dd..a16e57c1b7 100644 --- a/specs/src/SUMMARY.md +++ 
b/specs/src/SUMMARY.md @@ -1,3 +1,3 @@ # Summary -- [Shwap](https://github.com/celestiaorg/CIPs/pull/77#issuecomment-1977130416) +- [Shwap](https://github.com/Wondertan/CIPs/blob/cip-shwap_protocol/cips/cip-shwap_protocol.md) From da9a7117e18fede6c008c91f337fb7f58020e38f Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 21:27:56 +0100 Subject: [PATCH 10/17] prettify spec rendering --- specs/.gitignore | 1 + specs/book.toml | 10 +++++++--- specs/src/SUMMARY.md | 2 +- specs/src/shwap/spec.md | 1 + 4 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 specs/src/shwap/spec.md diff --git a/specs/.gitignore b/specs/.gitignore index 7585238efe..736a6b8c1a 100644 --- a/specs/.gitignore +++ b/specs/.gitignore @@ -1 +1,2 @@ book +theme \ No newline at end of file diff --git a/specs/book.toml b/specs/book.toml index 2ab3d5a398..8af841a922 100644 --- a/specs/book.toml +++ b/specs/book.toml @@ -5,9 +5,13 @@ multilingual = false src = "src" title = "Celestia Node Specification" +[preprocessor.webinclude] # Necessary to retrieve and render external documents like CIPs +[preprocessor.yml-header] # Necessary to remove front-matter headers that mdbook cannot render +[preprocessor.pagetoc] # Generates nice and handy toc on the left side of a page + [output.html] git-repository-url = "https://github.com/celestiaorg/celestia-node" -[preprocessor.toc] -command = "mdbook-toc" -renderer = ["html"] +# Necessary for pagetoc to work +additional-css = ["theme/pagetoc.css"] +additional-js = ["theme/pagetoc.js"] diff --git a/specs/src/SUMMARY.md b/specs/src/SUMMARY.md index a16e57c1b7..c5e5e92fa9 100644 --- a/specs/src/SUMMARY.md +++ b/specs/src/SUMMARY.md @@ -1,3 +1,3 @@ # Summary -- [Shwap](https://github.com/Wondertan/CIPs/blob/cip-shwap_protocol/cips/cip-shwap_protocol.md) +- [Shwap](./shwap/spec.md) diff --git a/specs/src/shwap/spec.md b/specs/src/shwap/spec.md new file mode 100644 index 0000000000..79c2115fad --- /dev/null +++ b/specs/src/shwap/spec.md @@ -0,0 
+1 @@ +{{#webinclude https://raw.githubusercontent.com/Wondertan/CIPs/cip-shwap_protocol/cips/cip-shwap_protocol.md}} \ No newline at end of file From 4d9ce37b8b54d5e53b0d64680e4aa30046ce37cc Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 21:41:37 +0100 Subject: [PATCH 11/17] gh pages build preprocessors for mdbook --- .github/workflows/github_pages.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/github_pages.yml b/.github/workflows/github_pages.yml index b212c2d885..3c1f2d0924 100644 --- a/.github/workflows/github_pages.yml +++ b/.github/workflows/github_pages.yml @@ -25,6 +25,14 @@ jobs: with: mdbook-version: "latest" + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + - uses: actions-rs/cargo@v1 + with: + command: install + args: mdbook-webinclude mdbook-yml-header mdbool-pagetoc + - name: Build book run: mdbook build specs From a62ca9297ef44a40e905154247986ae677b750ef Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 21:51:51 +0100 Subject: [PATCH 12/17] try different action with caching --- .github/workflows/github_pages.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/github_pages.yml b/.github/workflows/github_pages.yml index 3c1f2d0924..1beb6b0228 100644 --- a/.github/workflows/github_pages.yml +++ b/.github/workflows/github_pages.yml @@ -25,13 +25,11 @@ jobs: with: mdbook-version: "latest" - - uses: actions-rs/toolchain@v1 + - uses: actions-rs/install@v0.1 with: - toolchain: stable - - uses: actions-rs/cargo@v1 - with: - command: install - args: mdbook-webinclude mdbook-yml-header mdbool-pagetoc + crate: mdbook-webinclude mdbook-yml-header mdbool-pagetoc + version: latest + use-tool-cache: true - name: Build book run: mdbook build specs From 417a6e74b0781d4276c4f6b630f4a3de5b94d53e Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 21:53:45 +0100 Subject: [PATCH 13/17] use separate step for each binary ---
.github/workflows/github_pages.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github_pages.yml b/.github/workflows/github_pages.yml index 1beb6b0228..8080c7b8af 100644 --- a/.github/workflows/github_pages.yml +++ b/.github/workflows/github_pages.yml @@ -27,7 +27,19 @@ jobs: - uses: actions-rs/install@v0.1 with: - crate: mdbook-webinclude mdbook-yml-header mdbool-pagetoc + crate: mdbook-webinclude + version: latest + use-tool-cache: true + + - uses: actions-rs/install@v0.1 + with: + crate: mdbook-yml-header + version: latest + use-tool-cache: true + + - uses: actions-rs/install@v0.1 + with: + crate: mdbool-pagetoc version: latest use-tool-cache: true From cfebc8aa345f5c8cb275ce55216b0b901e2d4da3 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 21:55:16 +0100 Subject: [PATCH 14/17] fix attempt --- .github/workflows/github_pages.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/github_pages.yml b/.github/workflows/github_pages.yml index 8080c7b8af..c09a79d49c 100644 --- a/.github/workflows/github_pages.yml +++ b/.github/workflows/github_pages.yml @@ -25,19 +25,22 @@ jobs: with: mdbook-version: "latest" - - uses: actions-rs/install@v0.1 + - name: Install mdbook-webinclude + uses: actions-rs/install@v0.1 with: crate: mdbook-webinclude version: latest use-tool-cache: true - - uses: actions-rs/install@v0.1 + - name: Install mdbook-yml-header + uses: actions-rs/install@v0.1 with: crate: mdbook-yml-header version: latest use-tool-cache: true - - uses: actions-rs/install@v0.1 + - name: Install mdbook-pagetoc + uses: actions-rs/install@v0.1 with: crate: mdbool-pagetoc version: latest From bae3b65399786432cf3c951c427c26e3ef3712ba Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 21:59:12 +0100 Subject: [PATCH 15/17] fix mdbook-pagetoc --- .github/workflows/github_pages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/github_pages.yml b/.github/workflows/github_pages.yml index c09a79d49c..6abc18cfa0 100644 --- a/.github/workflows/github_pages.yml +++ b/.github/workflows/github_pages.yml @@ -42,7 +42,7 @@ jobs: - name: Install mdbook-pagetoc uses: actions-rs/install@v0.1 with: - crate: mdbool-pagetoc + crate: mdbook-pagetoc version: latest use-tool-cache: true From 59587c3971ff25e40fb80031978f2a0ad8a75075 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 22:09:25 +0100 Subject: [PATCH 16/17] update step names and trigger rebuild of preprocessors to see caching in practice --- .github/workflows/github_pages.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/github_pages.yml b/.github/workflows/github_pages.yml index 6abc18cfa0..8df5d7a8ee 100644 --- a/.github/workflows/github_pages.yml +++ b/.github/workflows/github_pages.yml @@ -25,21 +25,21 @@ jobs: with: mdbook-version: "latest" - - name: Install mdbook-webinclude + - name: Install mdbook-webinclude preprocessor uses: actions-rs/install@v0.1 with: crate: mdbook-webinclude version: latest use-tool-cache: true - - name: Install mdbook-yml-header + - name: Install mdbook-yml-header preprocessor uses: actions-rs/install@v0.1 with: crate: mdbook-yml-header version: latest use-tool-cache: true - - name: Install mdbook-pagetoc + - name: Install mdbook-pagetoc preprocessor uses: actions-rs/install@v0.1 with: crate: mdbook-pagetoc From 6a523fc2221228dd90e0b010898b76c81f171bb4 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 4 Mar 2024 22:41:16 +0100 Subject: [PATCH 17/17] new line to gitignore --- specs/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specs/.gitignore b/specs/.gitignore index 736a6b8c1a..ce8e07100b 100644 --- a/specs/.gitignore +++ b/specs/.gitignore @@ -1,2 +1,2 @@ book -theme \ No newline at end of file +theme