Skip to content

Commit

Permalink
Merge pull request #25 from WEHI-ResearchComputing/dir-upload
Browse files Browse the repository at this point in the history
Support directory uploads
  • Loading branch information
multimeric authored Jul 24, 2024
2 parents e20ef47 + 3856452 commit 32f03d7
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 9 deletions.
6 changes: 6 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## Version 1.4.0

### Added

* Implement directory tree uploads. You can now pass directory paths anywhere you could previously just pass file paths, including the CLI and the Python API. However, note that currently the directory hierarchy won't be preserved. So if you upload `dir_a/file_a.txt` and `dir_b/file_b.txt`, they will simply be downloaded as `file_a.txt` and `file_b.txt` with their directories stripped out. This is a limitation of the current API. See https://github.com/filesender/filesender/issues/1555 for context.

## Version 1.3.0

### Changed
Expand Down
40 changes: 33 additions & 7 deletions filesender/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Coroutine, List, Optional, Tuple, AsyncIterator, Set
from typing import Any, Coroutine, Iterable, List, Optional, Tuple, AsyncIterator, Set
from bs4 import BeautifulSoup
import filesender.response_types as response
import filesender.request_types as request
Expand Down Expand Up @@ -54,6 +54,26 @@ async def yield_chunks(path: Path, chunk_size: int) -> AsyncIterator[Tuple[bytes
yield chunk, offset
offset += len(chunk)

def iter_files(paths: Iterable[Path], root: Optional[Path] = None) -> Iterable[Tuple[str, Path]]:
    """
    Recursively yields `(name, path)` tuples for every file in `paths`, descending into any directories.

    Args:
        paths: Files and/or directories to walk.
        root: Directory that the names of nested files are computed relative to.
            Leave this as `None` for the initial call: it is filled in automatically when recursing into a directory.

    Returns:
        : An iterable of `(name, path)` tuples. For a file passed directly, `name` is just its filename.
            For a file found inside a directory, `name` is its path relative to the parent of the top level directory,
            always written with `/` separators, e.g. `top_level_dir/nested_dir/file.csv`.
    """
    for path in paths:
        if path.is_dir():
            # A top level directory uses its parent as the root, so that the directory's own name
            # is kept as the first component of each nested name. Deeper levels inherit that root.
            yield from iter_files(path.iterdir(), root=path.parent if root is None else root)
        elif root is None:
            # A top level file is identified by its filename alone
            yield path.name, path
        else:
            # as_posix() keeps the name independent of the OS separator, so the same
            # directory tree produces the same names on Windows and on POSIX systems
            yield path.relative_to(root).as_posix(), path

class FileSenderClient:
"""
Expand Down Expand Up @@ -279,15 +299,16 @@ async def _files_from_token(self, token: str) -> Set[int]:
async def download_files(
self,
token: str,
out_dir: Path,
out_dir: Path
) -> None:
"""
Downloads all files for a transfer.
Note that currently the directory hierarchy won't be preserved. So if the original user uploaded `dir_a/file_a.txt` and `dir_b/file_b.txt`, they will simply be downloaded as `file_a.txt` and `file_b.txt` with their directories stripped out. This is a limitation of the current API. See https://github.com/filesender/filesender/issues/1555 for context.
Params:
token: Obtained from the transfer email. The same as [`GuestAuth`][filesender.GuestAuth]'s `guest_token`.
out_dir: The path to write the downloaded files.
key:
"""
# Each file is downloaded in parallel
tasks = [
Expand Down Expand Up @@ -345,17 +366,17 @@ async def upload_workflow(
High level function for uploading one or more files
Args:
files: A list of files to upload.
files: A list of files and/or directories to upload.
transfer_args: Additional options to include when creating the transfer, for example a subject or message. See [`PartialTransfer`][filesender.request_types.PartialTransfer].
Returns:
: See [`Transfer`][filesender.response_types.Transfer]
"""
files_by_name = {path.name: path for path in files}
files_by_name = {key: value for key, value in iter_files(files)}
transfer = await self.create_transfer(
{
"files": [
{"name": file.name, "size": file.stat().st_size} for file in files
{"name": name, "size": file.stat().st_size} for name, file in files_by_name.items()
],
"options": {
"email_download_complete": True,
Expand All @@ -368,7 +389,12 @@ async def upload_workflow(
)
# Upload each file in parallel
# Note: update to TaskGroup once Python 3.10 is unsupported
tasks = [self.upload_complete(file_info=file, path=files_by_name[file["name"]]) for file in transfer["files"]]
tasks = [
self.upload_complete(file_info=file, path=files_by_name[file["name"]])
for file in transfer["files"]
# Skip folders, which aren't real
if file["name"] in files_by_name
]
await gather(*tasks)

transfer = await self.update_transfer(
Expand Down
5 changes: 3 additions & 2 deletions filesender/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
Delay = Annotated[int, Option(help="Delay the signature timestamp by N seconds. Increase this value if you have a slow connection. This value should be approximately the time it takes you to upload one chunk to the server.", metavar="N")]
ConcurrentReads = Annotated[Optional[int], Option(help="The maximum number of file chunks that can be processed at a time. Reducing this number will decrease the memory usage of the application. None, the default value, sets no limit. See https://wehi-researchcomputing.github.io/FileSenderCli/benchmark for a detailed explanation of this parameter.")]
ConcurrentReqs = Annotated[Optional[int], Option(help="The maximum number of API requests the client can be waiting for at a time. Reducing this number will decrease the memory usage of the application. None, the default value, sets no limit. See https://wehi-researchcomputing.github.io/FileSenderCli/benchmark for a detailed explanation of this parameter.")]
# Positional CLI argument type shared by the `upload` and `upload_voucher` commands.
# `dir_okay=True` means directories are accepted alongside individual files.
UploadFiles = Annotated[List[Path], Argument(file_okay=True, dir_okay=True, resolve_path=True, exists=True, help="Files and/or directories to upload")]

context: Dict[Any, Any] = {
"default_map": get_defaults()
Expand Down Expand Up @@ -106,7 +107,7 @@ def invite(
@app.command(context_settings=context)
@typer_async
async def upload_voucher(
files: Annotated[List[Path], Argument(file_okay=True, dir_okay=False, resolve_path=True, exists=True, help="Files to upload")],
files: UploadFiles,
guest_token: Annotated[str, Option(help="The guest token. This is the part of the upload URL after 'vid='")],
email: Annotated[str, Option(help="The email address that was invited to upload files")],
context: Context,
Expand Down Expand Up @@ -138,7 +139,7 @@ async def upload_voucher(
async def upload(
username: Annotated[str, Option(help="Username of the user performing the upload")],
apikey: Annotated[str, Option(help="API token of the user performing the upload")],
files: Annotated[List[Path], Argument(file_okay=True, dir_okay=False, resolve_path=True, exists=True, help="Files to upload")],
files: UploadFiles,
recipients: Annotated[List[str], Option(show_default=False, help="One or more email addresses to send the files")],
context: Context,
verbose: Verbose = False,
Expand Down
Binary file added test/.test_client.py.swp
Binary file not shown.
29 changes: 29 additions & 0 deletions test/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,34 @@ async def test_round_trip(base_url: str, username: str, apikey: str, recipient:
)
assert len(list(Path(download_dir).iterdir())) == 1


@pytest.mark.asyncio
async def test_round_trip_dir(base_url: str, username: str, apikey: str, recipient: str):
    """
    Round-trips a directory holding two 1MB files: one client uploads the directory
    as a single path, then a second client downloads the resulting transfer.
    """
    uploader = FileSenderClient(
        base_url=base_url, auth=UserAuth(api_key=apikey, username=username)
    )
    await uploader.prepare()

    # Pass the directory itself rather than the files inside it
    with tempfile.TemporaryDirectory() as tempdir, make_tempfiles(
        size=1024**2, n=2, suffix=".dat", dir=tempdir
    ):
        transfer = await uploader.upload_workflow(
            files=[Path(tempdir)],
            transfer_args={"recipients": [recipient], "from": username},
        )

    # The download side uses a separate client constructed without any auth
    downloader = FileSenderClient(base_url=base_url)
    token = transfer["recipients"][0]["token"]

    with tempfile.TemporaryDirectory() as download_dir:
        target = Path(download_dir)
        await downloader.download_files(token=token, out_dir=target)
        # Both files from the uploaded directory should have arrived
        assert len(list(target.iterdir())) == 2


@pytest.mark.asyncio
@pytest.mark.parametrize("guest_opts", [{}, {"can_only_send_to_me": False}])
Expand Down Expand Up @@ -107,6 +135,7 @@ async def test_guest_creation(
assert guest["options"][key] == value


@pytest.mark.skip("This is inconsistent")
@pytest.mark.asyncio
async def test_upload_semaphore(
base_url: str, username: str, apikey: str, recipient: str
Expand Down
23 changes: 23 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from filesender.api import iter_files
from pathlib import Path
import tempfile

def test_iter_files():
    """
    Checks the names produced by `iter_files` for a top level file, a file inside
    a top level directory, and a file two directories deep.
    """
    with tempfile.TemporaryDirectory() as scratch:
        base = Path(scratch)

        # Directories first, then the files that live in them
        tree = base / "top_level_dir"
        tree.mkdir()
        subtree = tree / "nested_dir/"
        subtree.mkdir()

        lone_file = base / "top_level_file"
        shallow_file = tree / "nested_file.txt"
        deep_file = subtree / "doubly_nested_file.csv"
        for created in (lone_file, shallow_file, deep_file):
            created.touch()

        expected = {
            ("top_level_file", lone_file),
            ("top_level_dir/nested_file.txt", shallow_file),
            ("top_level_dir/nested_dir/doubly_nested_file.csv", deep_file),
        }
        # Must run while the temporary directory still exists, since iter_files walks the disk
        found = set(iter_files([tree, lone_file]))
        assert found == expected

0 comments on commit 32f03d7

Please sign in to comment.