For llama cloud parser, can the API endpoint deal with batch job? #14780
Unanswered
timothylimyl
asked this question in
Q&A
Replies: 1 comment 4 replies
-
Yes, the Llama Cloud Parser API can handle batch jobs. You can modify your script to process multiple documents simultaneously, as shown below:
import os
import requests
import time
import threading
from functools import wraps
API_KEY = 'llx-pleasedonotrobme'
BASE_URL = 'https://api.cloud.llamaindex.ai/api/parsing'
def rate_limit(*, allowed_per_second: int):
max_period = 1.0 / allowed_per_second
last_call = [time.perf_counter()]
lock = threading.Lock()
def decorate(func):
@wraps(func)
def limit(*args, **kwargs):
with lock:
elapsed = time.perf_counter() - last_call[0]
hold = max_period - elapsed
if hold > 0:
time.sleep(hold)
result = func(*args, **kwargs)
last_call[0] = time.perf_counter()
return result
return limit
return decorate
@rate_limit(allowed_per_second=15)
def upload_file(file_path):
url = f'{BASE_URL}/upload'
headers = {
'accept': 'application/json',
'Authorization': f'Bearer {API_KEY}',
}
files = {
'file': (file_path, open(file_path, 'rb'), 'application/pdf')
}
response = requests.post(url, headers=headers, files=files)
try:
if response.status_code == 200:
print('File uploaded successfully')
return response.json()
else:
print('Failed to upload file')
print('Status code:', response.status_code)
print('Response:', response.text)
finally:
for file_entry in files:
file_entry[1][1].close()
def check_parsing_job_status(job_id):
url = f'{BASE_URL}/job/{job_id}'
headers = {
'accept': 'application/json',
'Authorization': f'Bearer {API_KEY}'
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.json()
else:
print('Failed to retrieve job status')
print('Status code:', response.status_code)
print('Response:', response.text)
def get_parsing_job_result_in_markdown(job_id):
url = f'{BASE_URL}/job/{job_id}/result/markdown'
headers = {
'accept': 'application/json',
'Authorization': f'Bearer {API_KEY}'
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.json()
else:
print('Failed to retrieve job result in Markdown')
print('Status code:', response.status_code)
print('Response:', response.text)
if __name__ == '__main__':
directory_path = '/home/Obama/Downloads/test_pdf'
file_paths = [os.path.join(directory_path, file) for file in os.listdir(directory_path)]
job_ids = []
for file_path in file_paths:
status = upload_file(file_path)
if status:
job_ids.append(status.get("id", None))
for job_id in job_ids:
while True:
job_status = check_parsing_job_status(job_id)
if job_status and job_status["status"] == "SUCCESS":
result = get_parsing_job_result_in_markdown(job_id)
markdown_text = result.get("markdown", None)
with open(f'{file_path}.md', 'w') as f:
f.write(markdown_text)
break
            time.sleep(5)
This script includes a rate-limiting decorator to manage the API's rate limits and processes multiple files in a batch.
Beta Was this translation helpful? Give feedback.
4 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
Currently, based on the endpoint given in your documentation (https://docs.cloud.llamaindex.ai/llamaparse/getting_started/api), can the API handle batch jobs as suggested in the Dosu bot reply above?
Beta Was this translation helpful? Give feedback.
All reactions