Skip to main content

Method Signature

client.ocr.process(
    document: OCRDocumentParam,
    model: Optional[str] = None
) -> OCRResponse

Parameters

document
OCRDocumentParam
required
Document input for OCR processing.
model
Optional[str]
Optional model identifier to use for OCR processing.

Response

model
str
The model used for OCR processing.
pages
List[OCRPage]
List of processed pages from the document.
usage
Dict[str, object]
Usage statistics for the OCR request.

Examples

from dedalus_labs import DedalusLabs
import base64

client = DedalusLabs()

# Process a PDF document: read it, base64-encode it, and send it as a data URL
with open("document.pdf", "rb") as pdf_file:
    pdf_bytes = pdf_file.read()
pdf_data = base64.b64encode(pdf_bytes).decode("utf-8")

pdf_document = {"document_url": f"data:application/pdf;base64,{pdf_data}"}
response = client.ocr.process(document=pdf_document)

# Print each page's index, its markdown, and a separator line
for page in response.pages:
    print(f"Page {page.index}:")
    print(page.markdown)
    print("---")
# Process an image file, passing a "type" entry alongside the data URL
with open("receipt.png", "rb") as image_file:
    image_bytes = image_file.read()
image_data = base64.b64encode(image_bytes).decode("utf-8")

receipt_document = {
    "document_url": f"data:image/png;base64,{image_data}",
    "type": "receipt",
}
response = client.ocr.process(document=receipt_document)

# Only the first page's markdown is printed
print("Extracted text:")
first_page = response.pages[0]
print(first_page.markdown)
# Process multiple pages and save markdown output
with open("report.pdf", "rb") as f:
    pdf_data = base64.b64encode(f.read()).decode("utf-8")

response = client.ocr.process(
    document={
        "document_url": f"data:application/pdf;base64,{pdf_data}"
    }
)

# Write UTF-8 explicitly: OCR text routinely contains non-ASCII characters,
# and the platform default encoding (e.g. cp1252 on Windows) may be unable
# to represent them.
with open("output.md", "w", encoding="utf-8") as f:
    for page in response.pages:
        # This example numbers page headings starting from page.index + 1
        f.write(f"# Page {page.index + 1}\n\n")
        f.write(page.markdown)
        f.write("\n\n")

print(f"Processed {len(response.pages)} pages")
print(f"Model used: {response.model}")
# Process a JPEG image, naming the OCR model explicitly
import base64
from pathlib import Path

image_path = Path("scan.jpg")
jpeg_bytes = image_path.read_bytes()
image_data = base64.b64encode(jpeg_bytes).decode("utf-8")

jpeg_document = {"document_url": f"data:image/jpeg;base64,{image_data}"}
response = client.ocr.process(
    document=jpeg_document,
    model="mistral-ocr-latest",
)

# Take the markdown of the first page and print it
first_page = response.pages[0]
markdown_text = first_page.markdown
print(markdown_text)
# Helper function to process any file
def process_document_ocr(file_path: str):
    """Run OCR on the file at ``file_path`` and return its markdown text.

    The file is read as bytes, base64-encoded, and passed to
    ``client.ocr.process`` as a ``data:`` URL. The media type is guessed
    from the file name and falls back to ``application/octet-stream`` when
    no guess is available. The markdown of every page in the response is
    joined with a blank line between pages.
    """
    import base64
    import mimetypes
    from pathlib import Path

    source = Path(file_path)

    guessed_type, _ = mimetypes.guess_type(file_path)
    if not guessed_type:
        guessed_type = "application/octet-stream"

    raw_bytes = source.read_bytes()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    data_url = f"data:{guessed_type};base64,{encoded}"

    ocr_result = client.ocr.process(document={"document_url": data_url})

    page_texts = [page.markdown for page in ocr_result.pages]
    return "\n\n".join(page_texts)

# Run the helper on an invoice and print the combined markdown
invoice_markdown = process_document_ocr("invoice.pdf")
print(invoice_markdown)

Build docs developers (and LLMs) love