Skip to content

Commit

Permalink
feat: make chunk size user defined
Browse files Browse the repository at this point in the history
BREAKING CHANGE: all APIs updated to have min/max encryptable bytes
passed in by the user.

This allows for varying the use of the lib.
  • Loading branch information
joshuef committed Sep 25, 2024
1 parent 7a113a0 commit 609327e
Show file tree
Hide file tree
Showing 7 changed files with 310 additions and 190 deletions.
10 changes: 8 additions & 2 deletions benches/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ use std::time::Duration;
// https://bheisler.github.io/criterion.rs/book/analysis.html#measurement
const SAMPLE_SIZE: usize = 20;

/// The maximum size (before compression) of an individual chunk of a file, defined as 1024kiB.
const MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// The minimum size (before compression) of an individual chunk of a file, defined as 1B.
const MIN_CHUNK_SIZE: usize = 1;

fn custom_criterion() -> Criterion {
Criterion::default()
.measurement_time(Duration::from_secs(40))
Expand All @@ -63,7 +68,8 @@ fn write(b: &mut Bencher<'_>, bytes_len: usize) {
|| random_bytes(bytes_len),
// actual benchmark
|bytes| {
let (_data_map, _encrypted_chunks) = encrypt(bytes).unwrap();
let (_data_map, _encrypted_chunks) =
encrypt(bytes, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap();
},
BatchSize::SmallInput,
);
Expand All @@ -72,7 +78,7 @@ fn write(b: &mut Bencher<'_>, bytes_len: usize) {
fn read(b: &mut Bencher, bytes_len: usize) {
b.iter_batched(
// the setup
|| encrypt(random_bytes(bytes_len)).unwrap(),
|| encrypt(random_bytes(bytes_len), MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap(),
// actual benchmark
|(data_map, encrypted_chunks)| {
let _raw_data = decrypt_full_set(&data_map, &encrypted_chunks).unwrap();
Expand Down
8 changes: 7 additions & 1 deletion examples/basic_encryptor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ fn file_name(name: XorName) -> String {
string
}

/// The maximum size (before compression) of an individual chunk of a file, defined as 1024kiB.
const MAX_CHUNK_SIZE: usize = 1024 * 1024;
/// The minimum size (before compression) of an individual chunk of a file, defined as 1B.
const MIN_CHUNK_SIZE: usize = 1;

#[derive(Clone)]
struct DiskBasedStorage {
pub(crate) storage_path: String,
Expand Down Expand Up @@ -147,7 +152,8 @@ async fn main() {
Err(error) => return println!("{}", error),
}

let (data_map, encrypted_chunks) = encrypt(Bytes::from(data)).unwrap();
let (data_map, encrypted_chunks) =
encrypt(Bytes::from(data), MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap();

let result = encrypted_chunks
.par_iter()
Expand Down
21 changes: 15 additions & 6 deletions src/chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,20 @@ pub struct RawChunk {

/// Hash all the chunks.
/// Creates [num cores] batches.
pub(crate) fn batch_chunks(bytes: Bytes) -> (usize, Vec<EncryptionBatch>) {
pub(crate) fn batch_chunks(
bytes: Bytes,
min_chunk_size: usize,
max_chunk_size: usize,
) -> (usize, Vec<EncryptionBatch>) {
let data_size = bytes.len();
let num_chunks = get_num_chunks(data_size);
let num_chunks = get_num_chunks(data_size, min_chunk_size, max_chunk_size);

let raw_chunks: Vec<_> = (0..num_chunks)
.map(|index| (index, bytes.clone()))
.par_bridge()
.map(|(index, bytes)| {
let (start, end) = get_start_end_positions(data_size, index);
let (start, end) =
get_start_end_positions(data_size, index, min_chunk_size, max_chunk_size);
let data = bytes.slice(start..end);
let hash = XorName::from_content(data.as_ref());
RawChunk { index, data, hash }
Expand All @@ -63,10 +68,14 @@ pub(crate) fn batch_chunks(bytes: Bytes) -> (usize, Vec<EncryptionBatch>) {
}

/// Calculate (start_position, end_position) for each chunk for the input file size
pub(crate) fn batch_positions(data_size: usize) -> Vec<(usize, usize)> {
let num_chunks = get_num_chunks(data_size);
pub(crate) fn batch_positions(
data_size: usize,
min_chunk_size: usize,
max_chunk_size: usize,
) -> Vec<(usize, usize)> {
let num_chunks = get_num_chunks(data_size, min_chunk_size, max_chunk_size);

(0..num_chunks)
.map(|index| get_start_end_positions(data_size, index))
.map(|index| get_start_end_positions(data_size, index, min_chunk_size, max_chunk_size))
.collect()
}
2 changes: 1 addition & 1 deletion src/data_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use xor_name::XorName;

/// Holds the information that is required to recover the content of the encrypted file.
/// This is held as a vector of `ChunkInfo`, i.e. a list of the file's chunk hashes.
/// Only files larger than 3072 bytes (3 * MIN_CHUNK_SIZE) can be self-encrypted.
/// Only files of at least 3 * the minimum chunk size (in bytes) can be self-encrypted.
/// Smaller files will have to be batched together.
#[derive(Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct DataMap(Vec<ChunkInfo>);
Expand Down
163 changes: 105 additions & 58 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@
//! async fn main() {
//! let file_size = 10_000_000;
//! let bytes = random_bytes(file_size);
//!
//! if let Ok((_data_map, _encrypted_chunks)) = encrypt(bytes) {
//! const MAX_CHUNK_SIZE: usize = 1024 * 1024;
//! const MIN_CHUNK_SIZE: usize = 1;
//! if let Ok((_data_map, _encrypted_chunks)) = encrypt(bytes, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE) {
//! // .. then persist the `encrypted_chunks`.
//! // Remember to keep `data_map` somewhere safe..!
//! }
Expand Down Expand Up @@ -123,12 +124,6 @@ use xor_name::XorName;
pub use bytes;
pub use xor_name;

/// The minimum size (before compression) of data to be self-encrypted, defined as 3B.
pub const MIN_ENCRYPTABLE_BYTES: usize = 3 * MIN_CHUNK_SIZE;
/// The maximum size (before compression) of an individual chunk of a file, defined as 500kiB.
pub const MAX_CHUNK_SIZE: usize = 512 * 1024;
/// The minimum size (before compression) of an individual chunk of a file, defined as 1B.
pub const MIN_CHUNK_SIZE: usize = 1;
/// Controls the compression-speed vs compression-density tradeoffs. The higher the quality, the
/// slower the compression. Range is 0 to 11.
pub const COMPRESSION_QUALITY: i32 = 6;
Expand Down Expand Up @@ -163,12 +158,17 @@ pub struct StreamSelfEncryptor {
impl StreamSelfEncryptor {
/// For encryption, return with an initialized streaming encryptor.
/// If a `chunk_dir` is provided, the encrypted_chunks will be written into the specified dir as well.
pub fn encrypt_from_file(file_path: PathBuf, chunk_dir: Option<PathBuf>) -> Result<Self> {
pub fn encrypt_from_file(
file_path: PathBuf,
chunk_dir: Option<PathBuf>,
min_chunk_size: usize,
max_chunk_size: usize,
) -> Result<Self> {
let file = File::open(&*file_path)?;
let metadata = file.metadata()?;
let file_size = metadata.len();

let batch_positions = batch_positions(file_size as usize);
let batch_positions = batch_positions(file_size as usize, min_chunk_size, max_chunk_size);

Ok(StreamSelfEncryptor {
file_path,
Expand Down Expand Up @@ -350,13 +350,18 @@ impl StreamSelfDecryptor {
}

/// Read a file from the disk to encrypt, and output the chunks to a given output directory if present.
pub fn encrypt_from_file(file_path: &Path, output_dir: &Path) -> Result<(DataMap, Vec<XorName>)> {
pub fn encrypt_from_file(
file_path: &Path,
output_dir: &Path,
min_chunk_size: usize,
max_chunk_size: usize,
) -> Result<(DataMap, Vec<XorName>)> {
let mut file = File::open(file_path)?;
let mut bytes = Vec::new();
let _ = file.read_to_end(&mut bytes)?;
let bytes = Bytes::from(bytes);

let (data_map, encrypted_chunks) = encrypt(bytes)?;
let (data_map, encrypted_chunks) = encrypt(bytes, min_chunk_size, max_chunk_size)?;

let mut chunk_names = Vec::new();
for chunk in encrypted_chunks {
Expand Down Expand Up @@ -401,16 +406,21 @@ pub fn decrypt_from_chunk_files(
/// Encrypts a set of bytes and returns the encrypted data together with
/// the data map that is derived from the input data, and is used to later decrypt the encrypted data.
/// Returns an error if the size is too small for self-encryption.
/// Only files larger than 3072 bytes (3 * MIN_CHUNK_SIZE) can be self-encrypted.
/// Only data of at least `3 * min_chunk_size` bytes can be self-encrypted.
/// Smaller files will have to be batched together for self-encryption to work.
pub fn encrypt(bytes: Bytes) -> Result<(DataMap, Vec<EncryptedChunk>)> {
if (MIN_ENCRYPTABLE_BYTES) > bytes.len() {
pub fn encrypt(
bytes: Bytes,
min_chunk_size: usize,
max_chunk_size: usize,
) -> Result<(DataMap, Vec<EncryptedChunk>)> {
let min_encryptable_bytes = 3 * min_chunk_size;
if (min_encryptable_bytes) > bytes.len() {
return Err(Error::Generic(format!(
"Too small for self-encryption! Required size at least {}",
MIN_ENCRYPTABLE_BYTES
min_encryptable_bytes
)));
}
let (num_chunks, batches) = chunk::batch_chunks(bytes);
let (num_chunks, batches) = chunk::batch_chunks(bytes, min_chunk_size, max_chunk_size);
let (data_map, encrypted_chunks) = encrypt::encrypt(batches);
if num_chunks > encrypted_chunks.len() {
return Err(Error::Encryption);
Expand Down Expand Up @@ -480,13 +490,20 @@ pub struct SeekInfo {
/// It is used to first fetch chunks using the `index_range`.
/// Then the chunks are passed into `self_encryption::decrypt_range` together
/// with `relative_pos` from the `SeekInfo` instance, and the `len` to be read.
pub fn seek_info(file_size: usize, pos: usize, len: usize) -> SeekInfo {
let (start_index, end_index) = overlapped_chunks(file_size, pos, len);

let relative_pos = if start_index == 2 && file_size < 3 * MAX_CHUNK_SIZE {
pos - (2 * get_chunk_size(file_size, 0))
pub fn seek_info(
file_size: usize,
pos: usize,
len: usize,
min_chunk_size: usize,
max_chunk_size: usize,
) -> SeekInfo {
let (start_index, end_index) =
overlapped_chunks(file_size, pos, len, min_chunk_size, max_chunk_size);

let relative_pos = if start_index == 2 && file_size < 3 * max_chunk_size {
pos - (2 * get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size))
} else {
pos % get_chunk_size(file_size, start_index)
pos % get_chunk_size(file_size, start_index, min_chunk_size, max_chunk_size)
};

SeekInfo {
Expand All @@ -501,9 +518,15 @@ pub fn seek_info(file_size: usize, pos: usize, len: usize) -> SeekInfo {

/// Returns the chunk index range [start, end) that is overlapped by the byte range defined by `pos`
/// and `len`. Returns empty range if `file_size` is so small that there are no chunks.
fn overlapped_chunks(file_size: usize, pos: usize, len: usize) -> (usize, usize) {
fn overlapped_chunks(
file_size: usize,
pos: usize,
len: usize,
min_chunk_size: usize,
max_chunk_size: usize,
) -> (usize, usize) {
// FIX THIS SHOULD NOT BE ALLOWED
if file_size < (3 * MIN_CHUNK_SIZE) || pos >= file_size || len == 0 {
if file_size < (3 * min_chunk_size) || pos >= file_size || len == 0 {
return (0, 0);
}

Expand All @@ -513,8 +536,8 @@ fn overlapped_chunks(file_size: usize, pos: usize, len: usize) -> (usize, usize)
None => file_size,
};

let start_index = get_chunk_index(file_size, pos);
let end_index = get_chunk_index(file_size, end);
let start_index = get_chunk_index(file_size, pos, min_chunk_size, max_chunk_size);
let end_index = get_chunk_index(file_size, end, min_chunk_size, max_chunk_size);

(start_index, end_index)
}
Expand Down Expand Up @@ -561,90 +584,114 @@ fn get_pki(src_hash: &XorName, n_1_src_hash: &XorName, n_2_src_hash: &XorName) -
}

// Returns the number of chunks according to file size.
fn get_num_chunks(file_size: usize) -> usize {
if file_size < (3 * MIN_CHUNK_SIZE) {
fn get_num_chunks(file_size: usize, min_chunk_size: usize, max_chunk_size: usize) -> usize {
if file_size < (3 * min_chunk_size) {
return 0;
}
if file_size < (3 * MAX_CHUNK_SIZE) {
if file_size < (3 * max_chunk_size) {
return 3;
}
if file_size % MAX_CHUNK_SIZE == 0 {
file_size / MAX_CHUNK_SIZE
if file_size % max_chunk_size == 0 {
file_size / max_chunk_size
} else {
(file_size / MAX_CHUNK_SIZE) + 1
(file_size / max_chunk_size) + 1
}
}

// Returns the size of a chunk according to file size.
fn get_chunk_size(file_size: usize, chunk_index: usize) -> usize {
if file_size < 3 * MIN_CHUNK_SIZE {
// Returns the size of a chunk according to file size and defined chunk sizes.
fn get_chunk_size(
file_size: usize,
chunk_index: usize,
min_chunk_size: usize,
max_chunk_size: usize,
) -> usize {
if file_size < 3 * min_chunk_size {
return 0;
}
if file_size < 3 * MAX_CHUNK_SIZE {
if file_size < 3 * max_chunk_size {
if chunk_index < 2 {
return file_size / 3;
} else {
// When the file_size % 3 > 0, the third (last) chunk includes the remainder
return file_size - (2 * (file_size / 3));
}
}
let total_chunks = get_num_chunks(file_size);
let total_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
if chunk_index < total_chunks - 2 {
return MAX_CHUNK_SIZE;
return max_chunk_size;
}
let remainder = file_size % MAX_CHUNK_SIZE;
let remainder = file_size % max_chunk_size;
let penultimate = (total_chunks - 2) == chunk_index;
if remainder == 0 {
return MAX_CHUNK_SIZE;
return max_chunk_size;
}
if remainder < MIN_CHUNK_SIZE {
if remainder < min_chunk_size {
if penultimate {
MAX_CHUNK_SIZE - MIN_CHUNK_SIZE
max_chunk_size - min_chunk_size
} else {
MIN_CHUNK_SIZE + remainder
min_chunk_size + remainder
}
} else if penultimate {
MAX_CHUNK_SIZE
max_chunk_size
} else {
remainder
}
}

// Returns the [start, end) half-open byte range of a chunk.
fn get_start_end_positions(file_size: usize, chunk_index: usize) -> (usize, usize) {
if get_num_chunks(file_size) == 0 {
fn get_start_end_positions(
file_size: usize,
chunk_index: usize,
min_chunk_size: usize,
max_chunk_size: usize,
) -> (usize, usize) {
if get_num_chunks(file_size, min_chunk_size, max_chunk_size) == 0 {
return (0, 0);
}
let start = get_start_position(file_size, chunk_index);
(start, start + get_chunk_size(file_size, chunk_index))
let start = get_start_position(file_size, chunk_index, min_chunk_size, max_chunk_size);
(
start,
start + get_chunk_size(file_size, chunk_index, min_chunk_size, max_chunk_size),
)
}

fn get_start_position(file_size: usize, chunk_index: usize) -> usize {
let total_chunks = get_num_chunks(file_size);
fn get_start_position(
file_size: usize,
chunk_index: usize,
min_chunk_size: usize,
max_chunk_size: usize,
) -> usize {
let total_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
if total_chunks == 0 {
return 0;
}
let last = (total_chunks - 1) == chunk_index;
let first_chunk_size = get_chunk_size(file_size, 0);
let first_chunk_size = get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size);
if last {
first_chunk_size * (chunk_index - 1) + get_chunk_size(file_size, chunk_index - 1)
first_chunk_size * (chunk_index - 1)
+ get_chunk_size(file_size, chunk_index - 1, min_chunk_size, max_chunk_size)
} else {
first_chunk_size * chunk_index
}
}

fn get_chunk_index(file_size: usize, position: usize) -> usize {
let num_chunks = get_num_chunks(file_size);
fn get_chunk_index(
file_size: usize,
position: usize,
min_chunk_size: usize,
max_chunk_size: usize,
) -> usize {
let num_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
if num_chunks == 0 {
return 0; // FIX THIS SHOULD NOT BE ALLOWED
}

let chunk_size = get_chunk_size(file_size, 0);
let chunk_size = get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size);
let remainder = file_size % chunk_size;

if remainder == 0
|| remainder >= MIN_CHUNK_SIZE
|| position < file_size - remainder - MIN_CHUNK_SIZE
|| remainder >= min_chunk_size
|| position < file_size - remainder - min_chunk_size
{
usize::min(position / chunk_size, num_chunks - 1)
} else {
Expand Down
Loading

0 comments on commit 609327e

Please sign in to comment.