Use checkpoints to avoid re-parsing a document when running extraction after conversion. First convert with save_checkpoint=True, then extract using the returned checkpoint_id:
# Example: convert a document once, save a checkpoint, then run structured
# extraction against that checkpoint instead of parsing the file again.
import json

from datalab_sdk import DatalabClient, ConvertOptions, ExtractOptions

client = DatalabClient()

# Step 1: Convert and save a checkpoint
convert_options = ConvertOptions(
    mode="accurate",
    save_checkpoint=True,
)
convert_result = client.convert("report.pdf", options=convert_options)
print(convert_result.markdown)

# Step 2: Extract using the checkpoint (no re-parsing needed)
# The schema is serialized with json.dumps, so page_schema is passed as a
# JSON string rather than a dict.
page_schema = json.dumps({
    "title": {"type": "string", "description": "Document title"},
    "author": {"type": "string", "description": "Author name"},
    "date": {"type": "string", "description": "Publication date"},
    "summary": {"type": "string", "description": "Brief summary of the document"},
})
extract_options = ExtractOptions(
    page_schema=page_schema,
    # Reuse the checkpoint produced by the conversion in step 1.
    checkpoint_id=convert_result.checkpoint_id,
)
extract_result = client.extract("report.pdf", options=extract_options)

# extraction_schema_json is decoded with json.loads before use.
extracted = json.loads(extract_result.extraction_schema_json)
print(extracted)
The result object contains the extracted data alongside standard conversion fields:
# Example: read the extracted data and the standard conversion fields from
# an extraction result.
# NOTE(review): assumes `json` is imported and that `client` and `options`
# (with a schema declaring "invoice_number" and "total") were created in an
# earlier setup step that is not shown here; confirm against that setup.
result = client.extract("invoice.pdf", options=options)

# Extracted structured data (JSON string)
extracted = json.loads(result.extraction_schema_json)
print(extracted["invoice_number"])
print(extracted["total"])

# Standard conversion fields are also available
print(result.success)
print(result.markdown)
print(result.page_count)
print(result.cost_breakdown)