Use checkpoints to avoid re-parsing a document when running segmentation after conversion. First convert with save_checkpoint=True, then segment using the returned checkpoint_id:
# Example: convert a document once, save a checkpoint, then run segmentation
# against that checkpoint instead of parsing the file a second time.
import json

from datalab_sdk import DatalabClient, ConvertOptions, SegmentOptions

client = DatalabClient()

# Step 1: Convert and save a checkpoint
convert_options = ConvertOptions(
    mode="accurate",
    save_checkpoint=True,
)
convert_result = client.convert("report.pdf", options=convert_options)
print(convert_result.markdown)

# Step 2: Segment using the checkpoint (no re-parsing needed)
# The schema is serialized to a JSON string before being handed to
# SegmentOptions; each section entry carries a name and a description.
segmentation_schema = json.dumps({
    "sections": [
        {"name": "executive_summary", "description": "Executive summary"},
        {"name": "financials", "description": "Financial data and analysis"},
        {"name": "outlook", "description": "Future outlook and projections"},
    ]
})
segment_options = SegmentOptions(
    segmentation_schema=segmentation_schema,
    # Reuse the checkpoint produced by the conversion in step 1.
    checkpoint_id=convert_result.checkpoint_id,
)
segment_result = client.segment("report.pdf", options=segment_options)
print(segment_result.segmentation_results)
The result object contains the segmentation data alongside standard conversion fields:
# Example: read the fields of a segmentation result.
# NOTE(review): `client` and `options` are not defined in this snippet;
# presumably a DatalabClient and a SegmentOptions created earlier — confirm.
result = client.segment("document.pdf", options=options)

# Segmentation results (list of segments with names and page ranges)
segments = result.segmentation_results
for segment in segments:
    print(f"Section: {segment['name']}")
    print(f" Pages: {segment['page_range']}")

# Standard conversion fields are also available
print(result.success)
print(result.markdown)
print(result.page_count)
print(result.cost_breakdown)