Basic usage
Extract text from a document using a base64-encoded data URI:
import os
import base64
from pathlib import Path
from dedalus_labs import Dedalus
# Create the client; the API key is looked up in the DEDALUS_API_KEY
# environment variable.
client = Dedalus(
    api_key=os.environ.get("DEDALUS_API_KEY")
)

# Read and encode the document
document_path = Path("/path/to/document.pdf")
with document_path.open("rb") as f:
    document_bytes = f.read()

# Create data URI
media_type = "application/pdf"  # or "image/png", "image/jpeg", etc.
document_b64 = base64.b64encode(document_bytes).decode('utf-8')
document_uri = f"data:{media_type};base64,{document_b64}"

# Process the document
response = client.ocr.process(
    document={"document_url": document_uri}
)
print(response.text)
Async usage
Process documents asynchronously:
import os
import base64
import asyncio
from pathlib import Path
from dedalus_labs import AsyncDedalus
# Async client; the API key is looked up in the DEDALUS_API_KEY
# environment variable.
client = AsyncDedalus(
    api_key=os.environ.get("DEDALUS_API_KEY")
)


async def main():
    """Encode a PDF as a base64 data URI, run OCR on it, and print the text."""
    # Prepare document
    with Path("/path/to/document.pdf").open("rb") as f:
        document_bytes = f.read()
    document_uri = f"data:application/pdf;base64,{base64.b64encode(document_bytes).decode()}"

    # Process document
    response = await client.ocr.process(
        document={"document_url": document_uri}
    )
    print(response.text)


asyncio.run(main())
Supported formats
The OCR API supports:
- PDFs - Multi-page documents
- Images - PNG, JPEG, WEBP, GIF, BMP
Helper function
Create a reusable helper to encode documents:
import base64
from pathlib import Path
from typing import Dict
def encode_document(file_path: Path) -> Dict[str, str]:
    """Encode a document file to a data URI for OCR processing.

    Args:
        file_path: Path of the file to read. Its extension (compared
            case-insensitively) selects the media type.

    Returns:
        A dict with a single ``"document_url"`` key whose value is a
        ``data:<media type>;base64,<payload>`` URI. Extensions that are not
        in the table below fall back to ``application/octet-stream``.
    """
    # Determine media type from extension
    media_types = {
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
    }
    suffix = file_path.suffix.lower()
    media_type = media_types.get(suffix, "application/octet-stream")

    # Read and encode
    with file_path.open("rb") as f:
        document_bytes = f.read()
    document_b64 = base64.b64encode(document_bytes).decode('utf-8')
    document_uri = f"data:{media_type};base64,{document_b64}"
    return {"document_url": document_uri}
# Usage
from dedalus_labs import Dedalus
# Build the client, encode the file once, then hand the payload to the OCR call.
client = Dedalus()
ocr_input = encode_document(Path("/path/to/file.pdf"))
response = client.ocr.process(document=ocr_input)
print(response.text)
Response structure
The OCR response includes extracted text and page-level information:
from dedalus_labs import Dedalus
client = Dedalus()
response = client.ocr.process(
    document=encode_document(Path("/path/to/document.pdf"))
)

# Access full text
print("Full text:")
print(response.text)

# Access page-by-page (pages may be empty or missing, hence the guard)
if response.pages:
    for i, page in enumerate(response.pages, 1):
        print(f"\nPage {i}:")
        print(page.text)
Specify model
Optionally specify which OCR model to use:
response = client.ocr.process(
document=encode_document(Path("/path/to/document.pdf")),
model="mistral/pixtral-ocr" # Specify model explicitly
)
print(response.text)
Multi-page PDFs
Process multi-page PDFs and access individual pages:
from pathlib import Path
from dedalus_labs import Dedalus
client = Dedalus()
response = client.ocr.process(
    document=encode_document(Path("/path/to/multipage.pdf"))
)

# Report totals; a falsy `pages` value is counted as zero pages.
print(f"Total pages: {len(response.pages) if response.pages else 0}")
print(f"\nFull document text length: {len(response.text)} characters")

# Process each page
if response.pages:
    for page_num, page in enumerate(response.pages, 1):
        print(f"\n--- Page {page_num} ---")
        print(f"Characters: {len(page.text)}")
        print(f"Preview: {page.text[:100]}...")
Markdown output
The OCR API returns markdown-formatted text, preserving document structure:
from pathlib import Path
from dedalus_labs import Dedalus
client = Dedalus()
response = client.ocr.process(
    document=encode_document(Path("/path/to/formatted_document.pdf"))
)

# Save as markdown file.
# Write as UTF-8 explicitly: extracted text can contain non-ASCII characters,
# and the platform default encoding may not be able to represent them.
output_path = Path("extracted_text.md")
output_path.write_text(response.text, encoding="utf-8")
print(f"Markdown saved to {output_path}")
The OCR output preserves formatting elements like:
- Headings - Converted to markdown headings
- Lists - Bullet points and numbered lists
- Tables - Markdown table format
- Emphasis - Bold and italic text
Batch processing
Process multiple documents:
from pathlib import Path
from dedalus_labs import Dedalus
client = Dedalus()
documents_dir = Path("/path/to/documents")
output_dir = Path("/path/to/output")
# parents=True so a missing intermediate directory is created instead of
# raising FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)

for doc_path in documents_dir.glob("*.pdf"):
    print(f"Processing {doc_path.name}...")
    response = client.ocr.process(
        document=encode_document(doc_path)
    )

    # Save extracted text as UTF-8 so non-ASCII characters survive regardless
    # of the platform default encoding.
    output_path = output_dir / f"{doc_path.stem}.md"
    output_path.write_text(response.text, encoding="utf-8")
    print(f" Saved to {output_path.name}")
    print(f" Extracted {len(response.text)} characters")
Image OCR
Extract text from images:
from pathlib import Path
from dedalus_labs import Dedalus
client = Dedalus()

# Encode the image file, then run OCR on the encoded payload.
image_input = encode_document(Path("/path/to/screenshot.png"))
response = client.ocr.process(document=image_input)

print("Extracted text from image:")
print(response.text)
Error handling
import dedalus_labs
from pathlib import Path
from dedalus_labs import Dedalus
client = Dedalus()

try:
    response = client.ocr.process(
        document=encode_document(Path("/path/to/document.pdf"))
    )
    print(response.text)
except FileNotFoundError:
    # Raised locally while reading the file, before any request is sent.
    print("Document file not found")
except dedalus_labs.BadRequestError as e:
    print(f"Invalid request: {e.message}")
    # Common causes: unsupported format, corrupt file, file too large
except dedalus_labs.APIConnectionError as e:
    print("Network error occurred")
    print(e.__cause__)
except dedalus_labs.APIStatusError as e:
    print(f"API error: {e.status_code}")
Complete example
Full workflow with error handling and output:
import base64
from pathlib import Path
from typing import Dict, Optional
from dedalus_labs import Dedalus
import dedalus_labs
def encode_document(file_path: Path) -> Dict[str, str]:
    """Encode document to data URI.

    Args:
        file_path: Path of the file to read. Its extension (compared
            case-insensitively) selects the media type.

    Returns:
        A dict with a single ``"document_url"`` key holding a
        ``data:<media type>;base64,<payload>`` URI. Unknown extensions fall
        back to ``application/octet-stream``.
    """
    # Covers every format listed under "Supported formats".
    media_types = {
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
    }
    suffix = file_path.suffix.lower()
    media_type = media_types.get(suffix, "application/octet-stream")

    with file_path.open("rb") as f:
        document_bytes = f.read()
    document_b64 = base64.b64encode(document_bytes).decode('utf-8')
    return {"document_url": f"data:{media_type};base64,{document_b64}"}
def extract_text(client: Dedalus, file_path: Path) -> Optional[str]:
    """Extract text from a document file.

    Args:
        client: Client used to issue the OCR request.
        file_path: Document to encode and submit.

    Returns:
        The extracted text, or ``None`` when the API call raises
        ``dedalus_labs.APIError`` (the error is printed first). Errors raised
        while reading the file are not caught here.
    """
    try:
        response = client.ocr.process(
            document=encode_document(file_path)
        )
        return response.text
    except dedalus_labs.APIError as e:
        print(f"Error processing {file_path.name}: {e}")
        return None
# Main execution
client = Dedalus()
document_path = Path("/path/to/document.pdf")
text = extract_text(client, document_path)

# extract_text signals failure with None; an empty string is a successful
# extraction of a document with no text, so test identity, not truthiness.
if text is not None:
    print(f"Successfully extracted {len(text)} characters")

    # Save to file as UTF-8 so non-ASCII characters survive regardless of the
    # platform default encoding.
    output_path = document_path.with_suffix(".md")
    output_path.write_text(text, encoding="utf-8")
    print(f"Saved to {output_path}")

    # Print preview
    print("\nPreview:")
    print(text[:500])
else:
    print("Failed to extract text")
Large documents may take longer to process. Consider:
- Breaking very large PDFs into smaller chunks
- Using async processing for multiple documents
- Implementing retry logic for transient failures