Your current environment
from PIL import Image
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info

MODEL_PATH = '/workspace/mnt/storage/trt-llama/Qwen2-VL-7B-Instruct'
IMAGE_PATH = '/workspace/mnt/storage/llm_storge/vllm/examples/demo.jpeg'

llm = LLM(
    model=MODEL_PATH,
    dtype='float32',
    limit_mm_per_prompt={'image': 10, 'video': 10},
)

sampling_params = SamplingParams(
    temperature=0.1, top_p=0.001, repetition_penalty=1.05, max_tokens=256,
    stop_token_ids=[],
)

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': [
        {
            'type': 'image',
            'image': IMAGE_PATH,
            'max_pixels': 12845056,
        },
        {
            'type': 'text',
            # "Output the detection box for the high-five" (击掌 = high-five)
            'text': '输出击掌的检测框',
        },
    ]},
]

processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)

mm_data = {}
if image_inputs is not None:
    mm_data['image'] = image_inputs
if video_inputs is not None:
    mm_data['video'] = video_inputs

llm_inputs = {
    'prompt': prompt,
    'multi_modal_data': mm_data,
}

# Expected: 击掌(529,516),(583,594)
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
generated_text = outputs[0].outputs[0].text
print(generated_text)
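
For the transformers side of the comparison reported below, here is a minimal sketch of a reference run, following the standard Qwen2-VL usage from the model card. The fp16 dtype and greedy decoding are assumptions (the report does not include the transformers script); MODEL_PATH and messages are reused from the script above.

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# Assumption: fp16 weights, matching the "transformers-qwen2-vl-fp16" label below.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype=torch.float16, device_map='auto',
)
processor = AutoProcessor.from_pretrained(MODEL_PATH)

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text], images=image_inputs, videos=video_inputs,
    padding=True, return_tensors='pt',
).to(model.device)

# Assumption: greedy decoding; the vLLM script above uses near-greedy
# sampling (temperature=0.1, top_p=0.001, repetition_penalty=1.05).
generated_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
trimmed = [
    out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)
]
print(processor.batch_decode(
    trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False,
)[0])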
Model Input Dumps
No response
🐛 Describe the bug
Qwen2-VL-7B-Instruct (vllm-qwenvl-fp16) has a bug: the detection boxes predicted by vllm-qwenvl and transformers-qwenvl differ.

击掌(529,513),(584,605)  <- vllm-fp16
击掌(531,516),(581,596)  <- transformers-qwen2-vl-fp16

(击掌 means "high-five".) The coordinates from vLLM are (529,513),(584,605), while the coordinates from transformers are (536,509),(588,602). The difference between them is significant.
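
To quantify the divergence, here is a small sketch that computes the IoU of the two boxes quoted in the previous sentence (coordinates copied verbatim; the iou helper is ad hoc, not part of the repro):

def iou(a, b):
    # a and b are (x1, y1, x2, y2) boxes.
    ix = max(0, min(a[2], b[2]) - max(a[0], b[0]))
    iy = max(0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = ix * iy
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union

print(iou((529, 513, 584, 605), (536, 509, 588, 602)))  # ~0.76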


Before submitting a new issue...
- Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.