CAMEL - Build Multi-Agent AI Systems (original) (raw)
Chunkr Reader allows you to process PDFs (and other documents) in chunks, with built-in OCR and format control. Below is a basic usage pattern. Initialize the ChunkrReader and ChunkrReaderConfig, set the file path and chunking options, then submit your task and fetch the results:
import asyncio
from camel.loaders import ChunkrReader, ChunkrReaderConfig
async def main():
    """Submit a document to Chunkr and print the task ID and task output.

    Creates a ``ChunkrReader`` and an example ``ChunkrReaderConfig``, submits
    the file at ``file_path``, prints the returned task ID and, when one was
    returned, requests the task output and prints it. Exceptions raised while
    submitting or retrieving are caught and reported with ``print``.
    """
    # Replace with your actual file path.
    file_path = "/path/to/your/document.pdf"

    reader = ChunkrReader()
    reader_config = ChunkrReaderConfig(
        chunk_processing=512,  # Example: target chunk length
        ocr_strategy="Auto",  # Example: OCR strategy
        high_resolution=False,  # False for faster processing (old "Fast" model)
    )

    try:
        task_id = await reader.submit_task(
            file_path=file_path,
            chunkr_config=reader_config,
        )
        print(f"Task ID: {task_id}")

        # Without a task ID there is nothing to retrieve.
        if not task_id:
            return

        # Poll and fetch the output.
        output_json = await reader.get_task_output(task_id=task_id)
        if not output_json:
            print(f"Failed to get output for task {task_id}, or task did not succeed/was cancelled.")
            return

        print("Task Output:")
        print(output_json)
    except ValueError as e:
        print(f"An error occurred during task submission or retrieval: {e}")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please check the path.")
    except Exception as e:
        # Last-resort handler so the example reports the problem instead of
        # ending with a traceback.
        print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
    # The example call stays disabled, so running this file as-is only prints
    # the setup instructions below.
    usage_hint = "To run this example, replace '/path/to/your/document.pdf' with a real file path, ensure CHUNKR_API_KEY is set, and uncomment 'asyncio.run(main())'."
    print(usage_hint)
    # asyncio.run(main()) # Uncomment to run the example
> > > Task ID: 7becf001-6f07-4f63-bddf-5633df363bbb
> > > Task Output:
> > > { "task_id": "7becf001-6f07-4f63-bddf-5633df363bbb", "status": "Succeeded", "created_at": "2024-11-08T12:45:04.260765Z", "finished_at": "2024-11-08T12:45:48.942365Z", "expires_at": null, "message": "Task succeeded", "output": { "chunks": [ { "segments": [ { "segment_id": "d53ec931-3779-41be-a220-3fe4da2770c5", "bbox": { "left": 224.16666, "top": 370.0, "width": 2101.6665, "height": 64.166664 }, "page_number": 1, "page_width": 2550.0, "page_height": 3300.0, "content": "Large Language Model based Multi-Agents: A Survey of Progress and Challenges", "segment_type": "Title", "ocr": null, "image": "https://chunkmydocs-bucket-prod.storage.googleapis.com/.../d53ec931-3779-41be-a220-3fe4da2770c5.jpg?...", "html": "<h1>Large Language Model based Multi-Agents: A Survey of Progress and Challenges</h1>", "markdown": "# Large Language Model based Multi-Agents: A Survey of Progress and Challenges\n\n" } ], "chunk_length": 11 }, { "segments": [ { "segment_id": "7bb38fc7-c1b3-4153-a3cc-116c0b9caa0a", "bbox": { "left": 432.49997, "top": 474.16666, "width": 1659.9999, "height": 122.49999 }, "page_number": 1, "page_width": 2550.0, "page_height": 3300.0, "content": "Taicheng Guo 1 , Xiuying Chen 2 , Yaqi Wang 3 \u2217 , Ruidi Chang , Shichao Pei 4 , Nitesh V. Chawla 1 , Olaf Wiest 1 , Xiangliang Zhang 1 \u2020", "segment_type": "Text", "ocr": null, "image": "https://chunkmydocs-bucket-prod.storage.googleapis.com/.../7bb38fc7-c1b3-4153-a3cc-116c0b9caa0a.jpg?...", "html": "<p>Taicheng Guo 1 , Xiuying Chen 2 , Yaqi Wang 3 \u2217 , Ruidi Chang , Shichao Pei 4 , Nitesh V. Chawla 1 , Olaf Wiest 1 , Xiangliang Zhang 1 \u2020</p>", "markdown": "Taicheng Guo 1 , Xiuying Chen 2 , Yaqi Wang 3 \u2217 , Ruidi Chang , Shichao Pei 4 , Nitesh V. Chawla 1 , Olaf Wiest 1 , Xiangliang Zhang 1 \u2020\n\n" } ], "chunk_length": 100 } ] } }