Documentation Index
Fetch the complete documentation index at: https://documentation.datalab.to/llms.txt
Use this file to discover all available pages before exploring further.
For long documents, use page ranges and document segmentation to improve speed and accuracy.
Restrict to Specific Pages
If you know which pages contain the data you need, use page_range:
from datalab_sdk import DatalabClient, ConvertOptions
import json
client = DatalabClient()

# JSON schema for the single field to pull out of the document.
summary_field = {"type": "string", "description": "Executive summary text"}
schema = {
    "type": "object",
    "properties": {"executive_summary": summary_field},
}

# Only process pages 0-5 (first 6 pages)
options = ConvertOptions(
    page_schema=json.dumps(schema),
    page_range="0-5",
    mode="balanced",
)
result = client.convert("long_document.pdf", options=options)
You’re only charged for the pages you process.
Segment and Chain Extractions
For documents with distinct sections (like financial reports or contracts), extract the table of contents first, then process each section separately.
import json
from datalab_sdk import DatalabClient, ConvertOptions
client = DatalabClient()

# Schema asking for every table-of-contents entry as {section_name, page_number}.
toc_schema = {
    "type": "object",
    "properties": {
        "table_of_contents": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "section_name": {"type": "string"},
                    "page_number": {"type": "number"},
                },
            },
        }
    },
}

# Extract TOC from first few pages
options = ConvertOptions(
    page_schema=json.dumps(toc_schema),
    page_range="0-5",
    mode="balanced",
)
result = client.convert("report.pdf", options=options)
toc = json.loads(result.extraction_schema_json)

# The extraction may omit the key (or return null); treat that as "no sections".
sections = toc.get("table_of_contents") or []

print("Sections found:")
for item in sections:
    print(f" {item['section_name']}: page {item['page_number']}")

# Define schemas for different sections
section_schemas = {
    "Financial Highlights": {
        "type": "object",
        "properties": {
            "revenue": {"type": "number"},
            "net_income": {"type": "number"},
            "eps": {"type": "number"},
        },
    },
    "Risk Factors": {
        "type": "object",
        "properties": {
            "risks": {
                "type": "array",
                "items": {"type": "string"},
            }
        },
    },
}

# Build page ranges from TOC
# NOTE(review): page_range is 0-based here (pages 0-5 are the first 6 pages),
# while a printed TOC usually lists the document's own page labels - confirm
# whether an offset has to be applied before using them as a range.
results = {}
for i, section in enumerate(sections):
    section_name = section["section_name"]

    # Skip sections we have no schema for before doing any range arithmetic.
    schema = section_schemas.get(section_name)
    if not schema:
        continue

    # "number" values may come back as floats (e.g. 12.0); a page range needs
    # whole page numbers, otherwise it would render as "12.0-14.0".
    start_page = int(section["page_number"])

    if i + 1 < len(sections):
        # The section ends on the page before the next one starts. Clamp to
        # start_page so two sections sharing a page never yield "12-11".
        next_start = int(sections[i + 1]["page_number"])
        end_page = max(start_page, next_start - 1)
        page_range = f"{start_page}-{end_page}"
    else:
        # The TOC gives no end for the last section, so only its first page is
        # requested; widen this range if you know the document's page count.
        page_range = str(start_page)

    options = ConvertOptions(
        page_schema=json.dumps(schema),
        page_range=page_range,
        mode="balanced",
    )
    result = client.convert("report.pdf", options=options)
    results[section_name] = json.loads(result.extraction_schema_json)

print(results)
Use Document Segmentation
For documents without a clear table of contents, use Document Segmentation to automatically split by section headers.
# Each detected section is reported with its title and one of a fixed set of types.
section_item_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "type": {
            "type": "string",
            "enum": ["introduction", "methods", "results", "conclusion"],
        },
    },
}
segmentation_schema = {
    "type": "object",
    "properties": {
        "sections": {"type": "array", "items": section_item_schema},
    },
}

options = ConvertOptions(
    segmentation_schema=json.dumps(segmentation_schema),
    mode="balanced",
)
result = client.convert("paper.pdf", options=options)

# Access segmentation results
segments = result.segmentation_results
Full Example
Complete workflow for processing a 100+ page financial report:
import json
from datalab_sdk import DatalabClient, ConvertOptions
# Shared client that extract_with_toc uses for every convert call.
client = DatalabClient()
def _toc_page_number(entry):
    """Return the entry's page number as an int, or None if missing or unusable."""
    try:
        return int(entry["page_number"])
    except (KeyError, TypeError, ValueError, OverflowError):
        return None


def _section_page_ranges(sections):
    """Yield (section_name, page_range) for each usable TOC entry, in TOC order."""
    # Keep only entries with a string name and an integral page number, so one
    # malformed entry cannot abort the whole run.
    usable = []
    for entry in sections:
        name = entry.get("section_name") if isinstance(entry, dict) else None
        page = _toc_page_number(entry)
        if isinstance(name, str) and page is not None:
            usable.append((name, page))

    for i, (name, start_page) in enumerate(usable):
        if i + 1 < len(usable):
            # The section ends on the page before the next one starts. Clamp to
            # start_page so sections sharing a page never yield "12-11".
            end_page = max(start_page, usable[i + 1][1] - 1)
            yield name, f"{start_page}-{end_page}"
        else:
            # The TOC gives no end for the last section, so only its first
            # page is requested.
            yield name, str(start_page)


def extract_with_toc(pdf_path: str, section_schemas: dict) -> dict:
    """Extract data from a long document using TOC-based segmentation.

    The table of contents is extracted from pages 0-6 first. Each listed
    section that has an entry in ``section_schemas`` is then converted on its
    own page range with that schema.

    Args:
        pdf_path: Path of the document passed to ``client.convert``.
        section_schemas: Mapping of section name (as it appears in the TOC)
            to the JSON-schema dict to extract for that section.

    Returns:
        Mapping of section name to the parsed extraction result. Sections
        without a schema, with an unusable TOC entry, or whose extraction
        failed are absent.
    """
    # Step 1: Extract table of contents
    toc_schema = {
        "type": "object",
        "properties": {
            "table_of_contents": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "section_name": {"type": "string"},
                        "page_number": {"type": "number"},
                    },
                },
            }
        },
    }
    options = ConvertOptions(
        page_schema=json.dumps(toc_schema),
        page_range="0-6",
        mode="balanced",
    )
    result = client.convert(pdf_path, options=options)
    toc = json.loads(result.extraction_schema_json)
    # The key may be absent or null when no TOC was found.
    sections = toc.get("table_of_contents") or []

    # Step 2: Extract each section with its schema
    results = {}
    for section_name, page_range in _section_page_ranges(sections):
        # Check if we have a schema for this section
        schema = section_schemas.get(section_name)
        if not schema:
            continue

        options = ConvertOptions(
            page_schema=json.dumps(schema),
            page_range=page_range,
            mode="balanced",
        )
        # Deliberately best-effort: a failure in one section is reported and
        # the remaining sections are still processed.
        try:
            result = client.convert(pdf_path, options=options)
            results[section_name] = json.loads(result.extraction_schema_json)
            print(f"Extracted: {section_name}")
        except Exception as e:
            print(f"Error extracting {section_name}: {e}")
    return results
# Define schemas for sections you care about
financial_highlights_schema = {
    "type": "object",
    "properties": {
        "total_revenue": {"type": "number", "description": "Total revenue"},
        "net_income": {"type": "number", "description": "Net income"},
        "year": {"type": "string", "description": "Fiscal year"},
    },
}
business_overview_schema = {
    "type": "object",
    "properties": {
        "description": {"type": "string", "description": "Business description"},
        "products": {"type": "array", "items": {"type": "string"}},
    },
}

# Keys must match the section names as they appear in the table of contents.
schemas = {
    "Financial Highlights": financial_highlights_schema,
    "Business Overview": business_overview_schema,
}

results = extract_with_toc("annual_report.pdf", schemas)
print(json.dumps(results, indent=2))
Tips
- Process only the pages you need: use page_range to avoid processing unnecessary pages
- Extract TOC first - Build page ranges dynamically from the document structure
- Use appropriate modes: balanced is usually sufficient; use accurate for complex tables
- Handle errors - Some sections may not match your schema exactly
Next Steps
Structured Extraction
Learn the full structured extraction API and schema options.
Document Segmentation
Automatically split documents by section headers.
Batch Processing
Process multiple long documents efficiently in parallel.
Pipelines
Chain processors into versioned, reusable pipelines.