feat: Support for URL convert #4

Open · wants to merge 1 commit into base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
__pycache__/

# Additional IDE files
*.iml
39 changes: 33 additions & 6 deletions README.md
@@ -5,36 +5,41 @@

## Comparison to Other Parsing Libraries

| Original PDF |
| -------------------------------------------------------------------------------------------------------------------- |
| <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/original.png" width="500"/> |

| Docling-API | Marker |
| ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
| <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/docling.png" width="500"/> | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/marker.png" width="500"/> |

| PyPDF | PyMuPDF4LLM |
| ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
| <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/pypdf.png" width="500"/> | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/pymupdf.png" width="500"/> |

## Features

- **Multiple Format Support**: Converts various document types including:

- PDF files
- Microsoft Word documents (DOCX)
- PowerPoint presentations (PPTX)
- HTML files
- Images (JPG, PNG, TIFF, BMP)
- AsciiDoc files
- Markdown files
- URLs

- **Conversion Capabilities**:

- Text extraction and formatting
- Table detection, extraction and conversion
- Image extraction and processing
  - Multi-language OCR support (French, German, Spanish, English, Italian, Portuguese, etc.)
- Configurable image resolution scaling

- **API Endpoints**:

- Synchronous single document conversion
- Synchronous batch document conversion
- Asynchronous single document conversion with job tracking
@@ -49,39 +54,47 @@
## Environment Setup (Running Locally)

### Prerequisites

- Python 3.8 or higher
- Poetry (Python package manager)
- Redis server (for task queue)

### 1. Install Poetry (if not already installed)

```bash
curl -sSL https://install.python-poetry.org | python3 -
```

### 2. Clone and Setup Project

```bash
git clone https://github.com/drmingler/docling-api.git
cd docling-api
poetry install
```

### 3. Configure Environment

Create a `.env` file in the project root:

```bash
REDIS_HOST=redis://localhost:6379/0
ENV=development
```

### 4. Start Redis Server

Start Redis locally (install if not already installed):

#### For macOS:

```bash
brew install redis
brew services start redis
```

#### For Ubuntu/Debian:

```bash
sudo apt-get install redis-server
sudo service redis-server start
```

@@ -90,28 +103,33 @@ sudo service redis-server start
### 5. Start the Application Components

1. Start the FastAPI server:

```bash
poetry run uvicorn main:app --reload --port 8080
```

2. Start Celery worker (in a new terminal):

```bash
poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
```

3. Start Flower dashboard for monitoring (optional, in a new terminal):

```bash
poetry run celery -A worker.celery_config flower --port=5555
```

### 6. Verify Installation

1. Check if the API server is running:

```bash
curl http://localhost:8080/docs
```

2. Test Celery worker:

```bash
curl -X POST "http://localhost:8080/documents/convert" \
-H "accept: application/json" \
@@ -120,6 +138,7 @@ curl -X POST "http://localhost:8080/documents/convert" \
```
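With this PR's URL support, the same endpoint can also be exercised without a file upload. A hedged sketch: the JSON body shape below follows the new `url: HttpUrl = Body(None)` parameter in this diff, but depending on how FastAPI resolves the mixed `File`/`Body` signature, the URL may instead need to be sent as a form field.

```shell
# Convert a remote page by URL instead of uploading a file (new in this PR)
curl -X POST "http://localhost:8080/documents/convert" \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com"}'
```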

3. Access monitoring dashboard:

- Open http://localhost:5555 in your browser to view the Flower dashboard
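For larger documents, the asynchronous flow avoids holding the HTTP connection open while conversion runs. The paths below are assumptions inferred from the route handlers in this PR (`create_single_document_conversion_job`, `get_conversion_job_status`); confirm the exact routes on the /docs page.

```shell
# Submit an async conversion job (path assumed; verify against /docs)
curl -X POST "http://localhost:8080/conversion-jobs" \
  -H "accept: application/json" \
  -F "document=@/path/to/file.pdf"

# Poll the job status with the job_id returned above (path assumed)
curl "http://localhost:8080/conversion-jobs/<job_id>"
```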

### Development Notes
@@ -132,25 +151,31 @@ curl -X POST "http://localhost:8080/documents/convert" \
## Environment Setup (Running in Docker)

1. Clone the repository:

```bash
git clone https://github.com/drmingler/docling-api.git
cd docling-api
```

2. Create a `.env` file:

```bash
REDIS_HOST=redis://redis:6379/0
ENV=production
```

### CPU Mode

To start the service using CPU-only processing, use the following command. You can adjust the number of Celery workers with the `--scale` option; in this example, one worker is created:

```bash
docker-compose -f docker-compose.cpu.yml up --build --scale celery_worker=1
```

### GPU Mode (Recommended for production)

For production, enabling GPU acceleration is recommended, as it significantly improves performance. Use the command below to start the service with GPU support. You can also scale the number of Celery workers with the `--scale` option; here, three workers are launched:

```bash
docker-compose -f docker-compose.gpu.yml up --build --scale celery_worker=3
```
@@ -237,9 +262,11 @@ The service uses a distributed architecture with the following components:
- Multiple workers can be scaled horizontally for increased throughput

## License

The codebase is under the MIT license. See LICENSE for more information.

## Acknowledgements

- [Docling](https://github.com/DS4SD/docling), the state-of-the-art document conversion library by IBM
- [FastAPI](https://fastapi.tiangolo.com/), the web framework
- [Celery](https://docs.celeryq.dev/en/stable/) for distributed task processing
82 changes: 54 additions & 28 deletions document_converter/route.py
@@ -1,7 +1,8 @@
from io import BytesIO
from multiprocessing.pool import AsyncResult
from typing import List
from fastapi import APIRouter, File, HTTPException, UploadFile, Query, Body
from pydantic import HttpUrl

from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult
from document_converter.service import DocumentConverterService, DoclingDocumentConversion
@@ -23,16 +24,23 @@
description="Convert a single document synchronously",
)
async def convert_single_document(
document: UploadFile = File(None),
url: HttpUrl = Body(None),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
):
if document:
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
doc_input = (document.filename, BytesIO(file_bytes))
elif url:
doc_input = str(url)
else:
raise HTTPException(status_code=400, detail="Either document or url must be provided")

return document_converter_service.convert_document(
doc_input,
extract_tables=extract_tables_as_images,
image_resolution_scale=image_resolution_scale,
)
@@ -45,19 +53,25 @@ async def convert_single_document(
description="Convert multiple documents synchronously",
)
async def convert_multiple_documents(
documents: List[UploadFile] = File(None),
urls: List[HttpUrl] = Body(None),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
):
if documents:
doc_inputs = []
for document in documents:
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
doc_inputs.append((document.filename, BytesIO(file_bytes)))
elif urls:
doc_inputs = [str(url) for url in urls]
else:
raise HTTPException(status_code=400, detail="Either documents or urls must be provided")

return document_converter_service.convert_documents(
doc_inputs,
extract_tables=extract_tables_as_images,
image_resolution_scale=image_resolution_scale,
)
@@ -70,16 +84,23 @@ async def convert_multiple_documents(
description="Create a conversion job for a single document",
)
async def create_single_document_conversion_job(
document: UploadFile = File(None),
url: HttpUrl = Body(None),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
):
if document:
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
doc_input = (document.filename, file_bytes)
elif url:
doc_input = str(url)
else:
raise HTTPException(status_code=400, detail="Either document or url must be provided")

task = convert_document_task.delay(
doc_input,
extract_tables=extract_tables_as_images,
image_resolution_scale=image_resolution_scale,
)
@@ -104,20 +125,25 @@ async def get_conversion_job_status(job_id: str):
description="Create a conversion job for multiple documents",
)
async def create_batch_conversion_job(
documents: List[UploadFile] = File(None),
urls: List[HttpUrl] = Body(None),
extract_tables_as_images: bool = False,
image_resolution_scale: int = Query(4, ge=1, le=4),
):
"""Create a batch conversion job for multiple documents."""
if documents:
doc_inputs = []
for document in documents:
file_bytes = await document.read()
if not is_file_format_supported(file_bytes, document.filename):
raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
doc_inputs.append((document.filename, file_bytes))
elif urls:
doc_inputs = [str(url) for url in urls]
else:
raise HTTPException(status_code=400, detail="Either documents or urls must be provided")

task = convert_documents_task.delay(
doc_inputs,
extract_tables=extract_tables_as_images,
image_resolution_scale=image_resolution_scale,
)