From 513f8d5fb28a192880bd086a59ebebef168c600a Mon Sep 17 00:00:00 2001
From: Haodong Duan
Date: Sun, 17 Mar 2024 20:45:02 +0800
Subject: [PATCH 1/3] [API] Add Step-1V API (#117)

* add step1v api

* update stepai

* update README
---
 README.md               |  5 ++-
 vlmeval/api/__init__.py |  3 +-
 vlmeval/api/stepai.py   | 98 +++++++++++++++++++++++++++++++++++++++++
 vlmeval/config.py       |  3 +-
 4 files changed, 105 insertions(+), 4 deletions(-)
 create mode 100644 vlmeval/api/stepai.py

diff --git a/README.md b/README.md
index e9fe1a1a0..82e7ce650 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@

## πŸ†• News

+- **[2024-03-17]** We have added an API wrapper for [**Step-1V**](https://www.stepfun.com/#step1v) πŸ”₯πŸ”₯πŸ”₯
- **[2024-03-15]** We have updated to be compatible with the latest version of LLaVA. All LLaVA series models have been re-evaluated with temperature=0, and the new results have been updated to the leaderboard πŸ”₯πŸ”₯πŸ”₯
- **[2024-02-27]** We have fixed the evaluation results of [**Yi-VL-34B**](https://huggingface.co/01-ai/Yi-VL-34B), check the updated results [**here**](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard) πŸ”₯πŸ”₯πŸ”₯
- **[2024-02-25]** We have supported [**OCRBench**](https://github.com/Yuliang-Liu/MultimodalOCR). πŸ”₯πŸ”₯πŸ”₯
@@ -63,8 +64,8 @@

**Supported API Models**

-| [**GPT-4-Vision-Preview**](https://platform.openai.com/docs/guides/vision)πŸŽžοΈπŸš… | [**GeminiProVision**](https://platform.openai.com/docs/guides/vision)πŸŽžοΈπŸš… | [**QwenVLPlus**](https://huggingface.co/spaces/Qwen/Qwen-VL-Plus)πŸŽžοΈπŸš… | [**QwenVLMax**](https://huggingface.co/spaces/Qwen/Qwen-VL-Max)πŸŽžοΈπŸš… |
-| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [**GPT-4-Vision-Preview**](https://platform.openai.com/docs/guides/vision)πŸŽžοΈπŸš… | [**GeminiProVision**](https://platform.openai.com/docs/guides/vision)πŸŽžοΈπŸš… | [**QwenVLPlus**](https://huggingface.co/spaces/Qwen/Qwen-VL-Plus)πŸŽžοΈπŸš… | [**QwenVLMax**](https://huggingface.co/spaces/Qwen/Qwen-VL-Max)πŸŽžοΈπŸš… | [**Step-1V**](https://www.stepfun.com/#step1v)πŸŽžοΈπŸš… |
+| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------ |

**Supported PyTorch / HF Models**

diff --git a/vlmeval/api/__init__.py b/vlmeval/api/__init__.py
index 29dbab0a8..4de60ea1a 100644
--- a/vlmeval/api/__init__.py
+++ b/vlmeval/api/__init__.py
@@ -4,9 +4,10 @@
from .gemini import GeminiWrapper, GeminiProVision
from .qwen_vl_api import QwenVLWrapper, QwenVLAPI
from .qwen_api import QwenAPI
+from .stepai import Step1V

__all__ = [
    'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper',
    'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI',
-    'QwenAPI'
+    'QwenAPI', 'Step1V'
]
\ No newline at end of file

diff --git a/vlmeval/api/stepai.py b/vlmeval/api/stepai.py
new file mode 100644
index 000000000..f0d0c517d

from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
import base64  # explicit import; used by convert_image_to_base64 below

url = "https://b-openapi.basemind.com/openapi/v1/chat/completions"
headers = {
    'X-Request-Orgcode': 'companyA',
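    # Note: 'Authorization' below is a template. StepAPI.__init__ (further
    # down in this file) fills in the bearer token, taken from its `key`
    # argument or the STEPAI_API_KEY environment variable.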
    'Authorization': 'Bearer {}',
    'Content-Type': 'application/json'
}


def convert_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string


class StepAPI(BaseAPI):

    is_api: bool = True

    def __init__(self,
                 model: str = 'stepapi-rankboard',
                 retry: int = 10,
                 wait: int = 3,
                 key: str = None,
                 temperature: float = 0,
                 max_tokens: int = 300,
                 verbose: bool = True,
                 system_prompt: str = None,
                 **kwargs):
        self.model = model
        self.fail_msg = 'Fail to obtain answer via API.'
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.system_prompt = system_prompt
        if key is not None:
            self.key = key
        else:
            self.key = os.environ.get('STEPAI_API_KEY', '')
        # Work on a copy of the module-level template so that instantiating
        # the class does not mutate the shared headers dict.
        self.headers = headers.copy()
        self.headers['Authorization'] = self.headers['Authorization'].format(self.key)

        super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)

    @staticmethod
    def build_msgs(msgs_raw):
        # Pack all text / image inputs into a single user message.
        messages = []
        message = {"role": "user", "content": []}

        for msg in msgs_raw:
            if isimg(msg):
                image_b64 = convert_image_to_base64(msg)
                message['content'].append({
                    "image_b64": {'b64_json': image_b64},
                    "type": "image_b64"
                })
            else:
                message['content'].append({
                    'text': msg,
                    "type": 'text'
                })

        messages.append(message)
        return messages

    def generate_inner(self, inputs, **kwargs):
        payload = dict(
            model=self.model,
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            messages=self.build_msgs(msgs_raw=inputs),  # build the message list
            **kwargs)
        response = requests.post(url, headers=self.headers, data=json.dumps(payload))
        ret_code = response.status_code
        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code

        answer = self.fail_msg
        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['choices'][0]['message']['content'].strip()
        except Exception:
            pass
        return ret_code, answer, response


class Step1V(StepAPI):

    def generate(self, image_path, prompt, dataset=None):
        return super(StepAPI, self).generate([image_path, prompt])

    def interleave_generate(self, ti_list, dataset=None):
        return super(StepAPI, self).generate(ti_list)
\ No newline at end of file

diff --git a/vlmeval/config.py b/vlmeval/config.py
index b1fa26ddf..bc7e9458f 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -1,5 +1,5 @@
from .vlm import *
-from .api import GPT4V, GeminiProVision, GPT4V_Internal, QwenVLAPI
+from .api import *
from functools import partial

PandaGPT_ROOT = None
@@ -63,6 +63,7 @@
    'GeminiProVision': partial(GeminiProVision, temperature=0, retry=10),
    'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
    'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
+    'Step1V': partial(Step1V, temperature=0, retry=10),
}

xtuner_models = {

From b2645dbec7b183e5904daf92ae32da26e2ab198a Mon Sep 17 00:00:00 2001
From: YuZhiyin <148198020+YuZhiyin@users.noreply.github.com>
Date: Tue, 19 Mar 2024 14:00:33 +0800
Subject: [PATCH 2/3] add claude3-vision (#118)

* add claude3-vision

* add claudeAPI
---
 vlmeval/api/__init__.py |   3 +-
 vlmeval/api/claude.py   | 102 ++++++++++++++++++++++++++++++++++++++++
 vlmeval/config.py       |   1 +
 3 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 vlmeval/api/claude.py

diff --git a/vlmeval/api/__init__.py b/vlmeval/api/__init__.py
index 4de60ea1a..6982e1248 100644
--- a/vlmeval/api/__init__.py
+++ b/vlmeval/api/__init__.py
@@ -5,9 +5,10 @@
from .qwen_vl_api import QwenVLWrapper, QwenVLAPI
from .qwen_api import QwenAPI
from .stepai import Step1V
+from .claude import Claude_Wrapper, Claude3V

__all__ = [
    'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper',
    'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI',
-    'QwenAPI', 'Step1V'
+    'QwenAPI', 'Step1V', 'Claude3V', 'Claude_Wrapper'
]
\ No newline at end of file

diff --git a/vlmeval/api/claude.py b/vlmeval/api/claude.py
new file mode 100644
index 000000000..7ee4b44cc

from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from time import sleep
import base64

url = "https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat"
headers = {
    'alles-apin-token': '',
    'Content-Type': 'application/json'
}


class Claude_Wrapper(BaseAPI):

    is_api: bool = True

    def __init__(self,
                 model: str = 'claude-3-opus-20240229',
                 key: str = None,
                 retry: int = 10,
                 wait: int = 3,
                 system_prompt: str = None,
                 verbose: bool = True,
                 temperature: float = 0,
                 max_tokens: int = 1024,
                 **kwargs):

        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        if key is not None:
            self.key = key
        else:
            self.key = os.environ.get('ALLES', '')
        # Work on a copy of the module-level template so the token is set
        # per instance instead of mutating the shared headers dict.
        self.headers = headers.copy()
        self.headers['alles-apin-token'] = self.key

        super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)

    @staticmethod
    def build_msgs(msgs_raw):
        messages = []
        message = {"role": "user", "content": []}
        for msg in msgs_raw:
            if isimg(msg):
                media_type_map = {
                    'jpg': 'image/jpeg',
                    'jpeg': 'image/jpeg',
                    'png': 'image/png',
                    'gif': 'image/gif',
                    'webp': 'image/webp'
                }
                media_type = media_type_map[msg.split('.')[-1].lower()]
                with open(msg, "rb") as file:
                    image_data = base64.b64encode(file.read()).decode("utf-8")
                item = {
                    'type': 'image',
                    'source': {'type': 'base64', 'media_type': media_type, 'data': image_data}
                }
            else:
                item = {'type': 'text', 'text': msg}
            message['content'].append(item)
        messages.append(message)
        return messages

    def generate_inner(self, inputs, **kwargs):

        payload = json.dumps({
            "model": self.model,
            "max_tokens": self.max_tokens,
            "messages": self.build_msgs(msgs_raw=inputs),
            **kwargs
        })
        response = requests.request("POST", url, headers=self.headers, data=payload)

        ret_code = response.status_code
        retry = self.retry
        # Back off and retry on rate limiting (HTTP 429).
        while ret_code == 429 and retry > 0:
            sleep(15)
            response = requests.request("POST", url, headers=self.headers, data=payload)
            ret_code = response.status_code
            retry -= 1

        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
        answer = self.fail_msg

        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['data']['content'][0]['text'].strip()
        except Exception:
            pass

        return ret_code, answer, response


class Claude3V(Claude_Wrapper):

    def generate(self, image_path, prompt, dataset=None):
        return super(Claude_Wrapper, self).generate([image_path, prompt])

    def interleave_generate(self, ti_list, dataset=None):
        return super(Claude_Wrapper, self).generate(ti_list)
\ No newline at end of file
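
For reference, `build_msgs` above packs every input into a single user turn in the Anthropic messages format. An input such as `['cat.jpg', 'What is shown here?']` would serialize to roughly the following (base64 payload elided):

    [{'role': 'user', 'content': [
        {'type': 'image', 'source': {'type': 'base64', 'media_type': 'image/jpeg', 'data': '<b64>'}},
        {'type': 'text', 'text': 'What is shown here?'}
    ]}]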
diff --git a/vlmeval/config.py b/vlmeval/config.py
index bc7e9458f..727d9c1fa 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -49,6 +49,7 @@
    'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-1'),
    'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2'),
    'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus'),
+    'Claude3V': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10),
}

api_models = {

From b35ab5bd754af63cb5d49bdf08d7c7f83715099e Mon Sep 17 00:00:00 2001
From: kennymckormick
Date: Tue, 19 Mar 2024 14:06:13 +0800
Subject: [PATCH 3/3] update config for api models
---
 vlmeval/config.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vlmeval/config.py b/vlmeval/config.py
index 727d9c1fa..28288d868 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -49,22 +49,28 @@
    'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-1'),
    'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2'),
    'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus'),
-    'Claude3V': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10),
}

api_models = {
    'GPT4V': partial(GPT4V, model='gpt-4-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10),
+    # Internal Only
    'GPT4V_INT': partial(GPT4V_Internal, model='gpt-4-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10),
    'GPT4V_SHORT': partial(
        GPT4V, model='gpt-4-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10,
        system_prompt="Please respond to the following question / request in a short reply. "),
+    # Internal Only
    'GPT4V_SHORT_INT': partial(
        GPT4V_Internal, model='gpt-4-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10,
        system_prompt="Please respond to the following question / request in a short reply. "),
    'GeminiProVision': partial(GeminiProVision, temperature=0, retry=10),
    'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
    'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
+    # Internal Only
    'Step1V': partial(Step1V, temperature=0, retry=10),
+    # Internal Only
+    'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10),
+    'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10),
+    'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10),
}

xtuner_models = {
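
Usage sketch (illustrative only): with the three patches above applied, the new API models are picked up by name from the model registry in vlmeval/config.py. The snippet below assumes that `supported_VLM` merges the `api_models` group as it does for the other model dicts, that the relevant keys are exported (`STEPAI_API_KEY` for Step-1V, `ALLES` for the Claude proxy), and that `demo.jpg` / `a.jpg` / `b.jpg` are placeholder image paths.

    from vlmeval.config import supported_VLM

    # Instantiate through the registered partials (temperature=0, retry=10).
    step1v = supported_VLM['Step1V']()
    print(step1v.generate('demo.jpg', 'Describe this image.'))

    # The Claude variants share one wrapper; interleaved text / image
    # inputs go through interleave_generate.
    claude = supported_VLM['Claude3V_Opus']()
    print(claude.interleave_generate(['a.jpg', 'What differs between this image and', 'b.jpg']))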