Handling Long Documents - Datalab Documentation

For long documents, use page ranges and document segmentation to improve speed and accuracy.

Restrict to Specific Pages

If you know which pages contain the data you need, use page_range to process only those pages. Page numbers are zero-based, so "0-5" selects the first six pages:

from datalab_sdk import DatalabClient, ConvertOptions
import json

client = DatalabClient()

# JSON Schema for the extraction: one string field named "executive_summary".
summary_field = {"type": "string", "description": "Executive summary text"}
schema = {
    "type": "object",
    "properties": {"executive_summary": summary_field},
}

# Restrict the conversion to pages 0-5, i.e. the first 6 pages.
options = ConvertOptions(
    page_schema=json.dumps(schema),
    page_range="0-5",
    mode="balanced",
)

result = client.convert("long_document.pdf", options=options)

You’re only charged for the pages you process.

Segment and Chain Extractions

For documents with distinct sections (like financial reports or contracts), extract the table of contents first, then process each section separately.

Step 1: Extract Table of Contents

import json
from datalab_sdk import DatalabClient, ConvertOptions

client = DatalabClient()

# Each table-of-contents entry is an object with a section name and a page number.
toc_entry_schema = {
    "type": "object",
    "properties": {
        "section_name": {"type": "string"},
        "page_number": {"type": "number"},
    },
}
toc_schema = {
    "type": "object",
    "properties": {
        "table_of_contents": {"type": "array", "items": toc_entry_schema},
    },
}

# Only the first few pages (0-5) are scanned for the TOC.
options = ConvertOptions(
    page_schema=json.dumps(toc_schema),
    page_range="0-5",
    mode="balanced",
)

result = client.convert("report.pdf", options=options)
# The extraction comes back as a JSON string; decode it into a dict.
toc = json.loads(result.extraction_schema_json)

print("Sections found:")
for item in toc["table_of_contents"]:
    name, page = item["section_name"], item["page_number"]
    print(f"  {name}: page {page}")

Step 2: Extract Each Section

# Define schemas for different sections, keyed by the section name as it
# appears in the extracted table of contents.
section_schemas = {
    "Financial Highlights": {
        "type": "object",
        "properties": {
            "revenue": {"type": "number"},
            "net_income": {"type": "number"},
            "eps": {"type": "number"}
        }
    },
    "Risk Factors": {
        "type": "object",
        "properties": {
            "risks": {
                "type": "array",
                "items": {"type": "string"}
            }
        }
    }
}

# Build page ranges from TOC
sections = toc["table_of_contents"]
results = {}

for i, section in enumerate(sections):
    section_name = section["section_name"]

    # Skip sections we have no schema for before doing any page arithmetic
    schema = section_schemas.get(section_name)
    if not schema:
        continue

    # The TOC schema types page_number as "number", so it can come back as a
    # float (e.g. 12.0). Cast to int so the range reads "12-15", not "12.0-15.0".
    # NOTE(review): this uses the TOC page number directly as a page_range
    # index; confirm the two use the same numbering for your document.
    start_page = int(section["page_number"])

    if i + 1 < len(sections):
        # End on the page before the next section starts. Clamp to start_page
        # so two sections beginning on the same page never produce an
        # inverted range such as "12-11".
        end_page = max(start_page, int(sections[i + 1]["page_number"]) - 1)
        page_range = f"{start_page}-{end_page}"
    else:
        # The TOC gives no end for the last section, so only its first page
        # is requested here.
        page_range = str(start_page)

    options = ConvertOptions(
        page_schema=json.dumps(schema),
        page_range=page_range,
        mode="balanced"
    )

    result = client.convert("report.pdf", options=options)
    results[section_name] = json.loads(result.extraction_schema_json)

print(results)

Use Document Segmentation

For documents without a clear table of contents, use Document Segmentation to automatically split by section headers.

# Labels allowed for a segment's "type" field.
section_types = ["introduction", "methods", "results", "conclusion"]

# One segment: a title plus one of the labels above.
segment_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "type": {"type": "string", "enum": section_types},
    },
}
segmentation_schema = {
    "type": "object",
    "properties": {
        "sections": {"type": "array", "items": segment_schema},
    },
}

# No page_range is given here; the schema is passed as a JSON string.
options = ConvertOptions(
    segmentation_schema=json.dumps(segmentation_schema),
    mode="balanced",
)

result = client.convert("paper.pdf", options=options)
# Access segmentation results
segments = result.segmentation_results

Full Example

Complete workflow for processing a 100+ page financial report:

import json
from datalab_sdk import DatalabClient, ConvertOptions

# Module-level client; extract_with_toc() below uses it for every convert call.
client = DatalabClient()


def _page_range_for(sections: list, index: int) -> str:
    """Return the ``page_range`` string for the TOC entry at *index*."""
    start_page = int(sections[index]["page_number"])
    if index + 1 >= len(sections):
        # The TOC gives no end for the last entry, so only its first page
        # is requested.
        return str(start_page)

    # End on the page before the next section starts. Clamp to start_page so
    # two sections that begin on the same page never yield an inverted range
    # such as "12-11".
    end_page = max(start_page, int(sections[index + 1]["page_number"]) - 1)
    return f"{start_page}-{end_page}"


def extract_with_toc(
    pdf_path: str,
    section_schemas: dict,
    *,
    toc_page_range: str = "0-6",
    mode: str = "balanced",
) -> dict:
    """Extract data from a long document using TOC-based segmentation.

    First extracts the table of contents from ``toc_page_range``, then runs
    one extraction per TOC section that has an entry in ``section_schemas``.

    Args:
        pdf_path: Path of the document to process.
        section_schemas: Maps a section name (as it appears in the extracted
            TOC) to the JSON Schema dict used for that section. Sections
            without an entry are skipped.
        toc_page_range: Page range searched for the table of contents.
        mode: Processing mode passed to every ``ConvertOptions``.

    Returns:
        A dict mapping each successfully extracted section name to its
        decoded extraction result. Sections that fail are reported with
        ``print`` and left out.
    """
    # Step 1: Extract table of contents
    toc_schema = {
        "type": "object",
        "properties": {
            "table_of_contents": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "section_name": {"type": "string"},
                        "page_number": {"type": "number"}
                    }
                }
            }
        }
    }

    options = ConvertOptions(
        page_schema=json.dumps(toc_schema),
        page_range=toc_page_range,
        mode=mode
    )

    result = client.convert(pdf_path, options=options)
    toc = json.loads(result.extraction_schema_json)
    # `or []` also covers the key being present with a null value
    sections = toc.get("table_of_contents") or []

    # Step 2: Extract each section with its schema
    results = {}

    for i, section in enumerate(sections):
        section_name = section.get("section_name")

        # Check the schema first so sections we don't care about cost nothing
        schema = section_schemas.get(section_name)
        if not schema:
            continue

        # NOTE(review): the TOC page number is used directly as a page_range
        # index; confirm the two use the same numbering for your documents.
        try:
            page_range = _page_range_for(sections, i)
        except (KeyError, TypeError, ValueError) as e:
            # A malformed TOC entry should not abort the remaining sections
            print(f"Skipping {section_name}: invalid page number ({e})")
            continue

        options = ConvertOptions(
            page_schema=json.dumps(schema),
            page_range=page_range,
            mode=mode
        )

        # Deliberately broad: one failing section is reported and the loop
        # moves on to the next one.
        try:
            result = client.convert(pdf_path, options=options)
            results[section_name] = json.loads(result.extraction_schema_json)
            print(f"Extracted: {section_name}")
        except Exception as e:
            print(f"Error extracting {section_name}: {e}")

    return results


# One JSON Schema per section of interest; the key in `schemas` must match
# the section name found in the document's table of contents.
financial_highlights_schema = {
    "type": "object",
    "properties": {
        "total_revenue": {"type": "number", "description": "Total revenue"},
        "net_income": {"type": "number", "description": "Net income"},
        "year": {"type": "string", "description": "Fiscal year"},
    },
}
business_overview_schema = {
    "type": "object",
    "properties": {
        "description": {"type": "string", "description": "Business description"},
        "products": {"type": "array", "items": {"type": "string"}},
    },
}
schemas = {
    "Financial Highlights": financial_highlights_schema,
    "Business Overview": business_overview_schema,
}

# Run the TOC-driven extraction and pretty-print the combined result.
results = extract_with_toc("annual_report.pdf", schemas)
print(json.dumps(results, indent=2))

Tips

Process only the pages you need - Use page_range to avoid processing (and being charged for) unnecessary pages
Extract the TOC first - Build page ranges dynamically from the document structure
Use appropriate modes - balanced is usually sufficient; use accurate for complex tables
Handle errors - Some sections may not match your schema exactly, so catch failures per section and continue with the rest

Next Steps

Structured Extraction

Learn the full structured extraction API and schema options.

Document Segmentation

Automatically split documents by section headers.

Batch Processing

Process multiple long documents efficiently in parallel.

Pipelines

Chain processors into versioned, reusable pipelines.

Documentation Index

Restrict to Specific Pages

Segment and Chain Extractions

Step 1: Extract Table of Contents

Step 2: Extract Each Section

Use Document Segmentation

Full Example

Tips

Next Steps