The Qwen-VL models can answer questions based on the images you provide.
Go to the Playground of the Model Studio console to try the image understanding capability online.
How to use
You must first obtain an API key and set it as an environment variable. If you want to call the models through the OpenAI SDK or DashScope SDK, you must also install the SDK.
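To confirm the variable is visible to your code before making any calls, you can run a quick check. The snippet below is a minimal Python sketch; the export commands in the comments are ordinary shell syntax shown for illustration.
import os
# Assumes you have already set the key in your shell, for example:
#   export DASHSCOPE_API_KEY="sk-xxx"    (macOS/Linux)
#   setx DASHSCOPE_API_KEY "sk-xxx"      (Windows)
api_key = os.getenv("DASHSCOPE_API_KEY")
if not api_key:
    raise RuntimeError("DASHSCOPE_API_KEY is not set; obtain an API key and set it first.")
print("API key detected")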
Simple examples
OpenAI compatible
You can call the Qwen-VL models through the OpenAI SDK or by sending OpenAI-compatible HTTP requests.
Python
Sample code
from openai import OpenAI
import os
client = OpenAI(
api_key=os.getenv("DASHSCOPE_API_KEY"),
base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
)
completion = client.chat.completions.create(
model="qwen-vl-max",
messages=[
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"}},
{"type": "text", "text": "What is this"}
]}
]
)
print(completion.choices[0].message.content)
Sample response
This is a photo taken on the beach. In the photo, a person and a dog are sitting on the sand, with the sea and sky in the background. The person and the dog seem to be interacting, with the dog's front paw resting on the person's hand. Sunlight is shining from the right side of the picture, adding a warm atmosphere to the entire scene.
cURL
Sample code
curl --location 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-max",
"messages": [{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"}},
{"type": "text", "text": "What is this"}
]
}]
}'
Sample response
{
"choices": [
{
"message": {
"content": "This image shows a lady and a dog interacting on the beach. The lady is sitting on the sand, smiling and shaking hands with the dog. The background is the sea and sky, with sunlight shining on them, creating a warm atmosphere. The dog is wearing a collar and looks very gentle.",
"role": "assistant"
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"object": "chat.completion",
"usage": {
"prompt_tokens": 1270,
"completion_tokens": 54,
"total_tokens": 1324
},
"created": 1725948561,
"system_fingerprint": null,
"model": "qwen-vl-max",
"id": "chatcmpl-0fd66f46-b09e-9164-a84f-3ebbbedbac15"
}
Node.js
Sample code
import OpenAI from "openai";
const openai = new OpenAI(
{
// If the environment variable is not configured, please replace the following line with: apiKey: "sk-xxx",
apiKey: process.env.DASHSCOPE_API_KEY,
baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
}
);
async function main() {
const response = await openai.chat.completions.create({
model: "qwen-vl-max",
messages: [{role: "user",content: [
{ type: "image_url",image_url: {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"}},
{ type: "text", text: "What is this?" }
]}]
});
console.log(response.choices[0].message.content);
}
main()
Sample response
This is a photo taken on the beach. In the photo, a woman in a plaid shirt is sitting on the sand, interacting with a yellow Labrador wearing a collar. The background is the sea and sky, with sunlight shining on them, creating a warm atmosphere.
DashScope
You can call the Qwen-VL models through the DashScope SDK or by sending HTTP requests.
Python
Sample code
import os
import dashscope
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role": "user",
"content": [
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},
{"text": "What is this?"}
]
}
]
response = dashscope.MultiModalConversation.call(
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-max',
messages=messages
)
print(response.output.choices[0].message.content[0]["text"])
Sample response
This is a photo taken on the beach. In the photo, there is a lady and a dog. The lady is sitting on the sand, smiling and interacting with the dog. The dog is wearing a collar and seems to be shaking hands with the lady. The background is the sea and sky, with sunlight shining on them, creating a warm atmosphere.
Java
Sample code
import java.util.Arrays;
import java.util.Collections;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.JsonUtils;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
Collections.singletonMap("image", "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"),
Collections.singletonMap("text", "What is this?"))).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.model("qwen-vl-max")
.message(userMessage)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
Sample response
This is a photo taken on the beach. In the photo, there is a person in a plaid shirt and a dog wearing a collar. The person and the dog are sitting face to face, seemingly interacting. The background is the sea and sky, with sunlight shining on them, creating a warm atmosphere.
cURL
Sample code
curl -X POST https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"model": "qwen-vl-max",
"input":{
"messages":[
{
"role": "user",
"content": [
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},
{"text": "What is this?"}
]
}
]
}
}'
Sample response
{
"output": {
"choices": [
{
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": [
{
"text": "This is a photo taken on the beach. In the photo, there is a person in a plaid shirt and a dog wearing a collar. They are sitting on the sand, with the sea and sky in the background. Sunlight is shining from the right side of the picture, adding a warm atmosphere to the entire scene."
}
]
}
}
]
},
"usage": {
"output_tokens": 55,
"input_tokens": 1271,
"image_tokens": 1247
},
"request_id": "ccf845a3-dc33-9cda-b581-20fe7dc23f70"
}
Multiple image input
The Qwen-VL models support multiple images in a single request. Sample code:
OpenAI compatible
You can call the Qwen-VL models through the OpenAI SDK or by sending OpenAI-compatible HTTP requests.
Python
Sample code
import os
from openai import OpenAI
client = OpenAI(
api_key=os.getenv("DASHSCOPE_API_KEY"),
base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
model="qwen-vl-max",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"
},
},
{"type": "text", "text": "What are these"},
],
}
],
)
print(completion.choices[0].message.content)
Sample response
In Image 1, there is a scene of a lady and a Labrador interacting on the beach. The lady is wearing a plaid shirt, sitting on the sand, shaking hands with the dog. The background is waves and sky, and the whole picture is filled with a warm and pleasant atmosphere.
In Image 2, there is a scene of a tiger walking in the forest. The tiger's fur is orange with black stripes, and it is stepping forward. The surroundings are dense trees and vegetation, with fallen leaves covering the ground, giving a wild and natural feeling.
cURL
Sample code
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"model": "qwen-vl-max",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"
}
},
{
"type": "image_url",
"image_url": {
"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"
}
},
{
"type": "text",
"text": "What are these"
}
]
}
]
}'
Sample response
{
"choices": [
{
"message": {
"content": "In Image 1, there is a scene of a lady and a Labrador interacting on the beach. The lady is wearing a plaid shirt, sitting on the sand, shaking hands with the dog. The background is the sea view and sunset sky, making the whole picture very warm and harmonious.\n\nIn Image 2, there is a scene of a tiger walking in the forest. The tiger's fur is orange with black stripes, and it is stepping forward. The surroundings are dense trees and vegetation, with fallen leaves covering the ground, giving a natural wildness and vitality.",
"role": "assistant"
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"object": "chat.completion",
"usage": {
"prompt_tokens": 2497,
"completion_tokens": 109,
"total_tokens": 2606
},
"created": 1725948561,
"system_fingerprint": null,
"model": "qwen-vl-max",
"id": "chatcmpl-0fd66f46-b09e-9164-a84f-3ebbbedbac15"
}
Node.js
Sample code
import OpenAI from "openai";
const openai = new OpenAI(
{
// If the environment variable is not configured, please replace the following line with: apiKey: "sk-xxx",
apiKey: process.env.DASHSCOPE_API_KEY,
baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
}
);
async function main() {
const response = await openai.chat.completions.create({
model: "qwen-vl-max",
messages: [{role: "user",content: [
{ type: "image_url",image_url: {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"}},
{ type: "image_url",image_url: {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"}},
{ type: "text", text: "What are these?" },
]}]
});
console.log(response.choices[0].message.content);
}
main()
Sample response
In the first image, a person and a dog are interacting on the beach. The person is wearing a plaid shirt, and the dog is wearing a collar. They seem to be shaking hands or high-fiving.
In the second image, a tiger is walking in the forest. The tiger's fur is orange with black stripes, and the background is green trees and vegetation.
DashScope
You can call the Qwen-VL models through the DashScope SDK or by sending HTTP requests.
Python
Sample code
import os
import dashscope
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role": "user",
"content": [
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"},
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/rabbit.png"},
{"text": "What are these?"}
]
}
]
response = dashscope.MultiModalConversation.call(
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-max',
messages=messages
)
print(response.output.choices[0].message.content[0]["text"])
Sample response
These images show some animals and natural scenes. In the first image, a person and a dog are interacting on the beach. In the second image, a tiger is walking in the forest. In the third image, a cartoon-style rabbit is hopping on the grass.
Java
Sample code
import java.util.Arrays;
import java.util.Collections;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
Collections.singletonMap("image", "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"),
Collections.singletonMap("image", "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"),
Collections.singletonMap("image", "https://dashscope.oss-cn-beijing.aliyuncs.com/images/rabbit.png"),
Collections.singletonMap("text", "What are these?"))).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.model("qwen-vl-max")
.message(userMessage)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
Sample response
These images show some animals and natural scenes.
1. First image: A woman and a dog interacting on the beach. The woman is wearing a plaid shirt, sitting on the sand, and the dog is wearing a collar, reaching out its paw to shake hands with the woman.
2. Second image: A tiger walking in the forest. The tiger's fur is orange with black stripes, and the background is trees and leaves.
3. Third image: A cartoon-style rabbit hopping on the grass. The rabbit is white, with pink ears, and the background is blue sky and yellow flowers.
These images showcase different animals and natural environments.
cURL
Sample code
curl --location 'https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-plus",
"input":{
"messages":[
{
"role": "user",
"content": [
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"},
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/rabbit.png"},
{"text": "What are these?"}
]
}
]
}
}'
Sample response
{
"output": {
"choices": [
{
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": [
{
"text": "This image shows a lady and her dog on the beach. They seem to be enjoying each other's company, with the dog sitting on the sand, reaching out its paw to shake hands or interact with the lady. The background is a beautiful sunset scene, with waves gently lapping the shoreline.\n\nPlease note that the description I provide is based on the visible content in the image and does not include any information beyond the visual information. If you need more specific details about this scene, please let me know!"
}
]
}
}
]
},
"usage": {
"output_tokens": 81,
"input_tokens": 1277,
"image_tokens": 1247
},
"request_id": "ccf845a3-dc33-9cda-b581-20fe7dc23f70"
}
Multi-round conversation
The Qwen-VL models can reference conversation history when generating responses. Sample code:
OpenAI compatible
You can call the Qwen-VL models through the OpenAI SDK or by sending OpenAI-compatible HTTP requests.
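Python
The multi-round pattern with the OpenAI SDK mirrors the cURL request below: append the assistant's first reply to the message list, then send the follow-up question. This is a minimal sketch, assuming the DASHSCOPE_API_KEY environment variable is set.
Sample code
import os
from openai import OpenAI
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)
messages = [
    {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"}},
        {"type": "text", "text": "What is this?"},
    ]}
]
first = client.chat.completions.create(model="qwen-vl-max", messages=messages)
print(f"First round: {first.choices[0].message.content}")
# Feed the assistant's reply back as conversation history, then ask the follow-up.
messages.append({"role": "assistant", "content": first.choices[0].message.content})
messages.append({"role": "user", "content": [{"type": "text", "text": "Write a poem describing this scene"}]})
second = client.chat.completions.create(model="qwen-vl-max", messages=messages)
print(f"Second round: {second.choices[0].message.content}")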
cURL
Sample code
curl -X POST https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"model": "qwen-vl-max",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"
}
},
{
"type": "text",
"text": "What is this"
}
]
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "This is a girl and a dog."
}
]
},
{
"role": "user",
"content": [
{
"type": "text",
"text": "Write a poem describing this scene"
}
]
}
]
}'
Sample response
{
"choices": [
{
"message": {
"content": "Sea breeze gently caresses smiling faces, \nOn the beach with the dog to accompany. \nSunset casts short shadows, \nHappy times, heart intoxicated.",
"role": "assistant"
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"object": "chat.completion",
"usage": {
"prompt_tokens": 1295,
"completion_tokens": 32,
"total_tokens": 1327
},
"created": 1726324976,
"system_fingerprint": null,
"model": "qwen-vl-max",
"id": "chatcmpl-3c953977-6107-96c5-9a13-c01e328b24ca"
}
DashScope
You can call the Qwen-VL models through the DashScope SDK or by sending HTTP requests.
Python
Sample code
import os
from dashscope import MultiModalConversation
import dashscope
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role": "user",
"content": [
{
"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"
},
{"text": "What is this?"},
],
}
]
response = MultiModalConversation.call(
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-max',
messages=messages
)
print(f"Model first round output: {response.output.choices[0].message.content[0]['text']}")
messages.append(response['output']['choices'][0]['message'])
user_msg = {"role": "user", "content": [{"text": "Write a poem describing this scene"}]}
messages.append(user_msg)
response = MultiModalConversation.call(
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-max',
messages=messages
)
print(f"Model second round output: {response.output.choices[0].message.content[0]['text']}")
Sample response
Model first round output: This is a photo taken on the beach. In the photo, there is a person in a plaid shirt and a dog wearing a collar. The person and the dog are sitting face to face, seemingly interacting. The background is the sea and sky, with sunlight shining on them, creating a warm atmosphere.
Model second round output: On the sunlit beach, people and dogs share joyful moments.
Java
Sample code
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
private static final String modelName = "qwen-vl-max";
public static void MultiRoundConversationCall() throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
MultiModalMessage systemMessage = MultiModalMessage.builder().role(Role.SYSTEM.getValue())
.content(Arrays.asList(Collections.singletonMap("text", "You are a helpful assistant."))).build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(Collections.singletonMap("image", "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"),
Collections.singletonMap("text", "What is this?"))).build();
List<MultiModalMessage> messages = new ArrayList<>();
messages.add(systemMessage);
messages.add(userMessage);
MultiModalConversationParam param = MultiModalConversationParam.builder()
// If the environment variable is not configured, please replace the following line with: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model(modelName)
.messages(messages)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println("First round output: "+result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text")); // add the result to conversation
messages.add(result.getOutput().getChoices().get(0).getMessage());
MultiModalMessage msg = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(Collections.singletonMap("text", "Write a poem describing this scene"))).build();
messages.add(msg);
param.setMessages((List)messages);
result = conv.call(param);
System.out.println("Second round output: "+result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text")); }
public static void main(String[] args) {
try {
MultiRoundConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
Sample response
First round output: This is a photo taken on the beach. In the photo, there is a person in a plaid shirt and a dog wearing a collar. The person and the dog are sitting face to face, seemingly interacting. The background is the sea and sky, with sunlight shining on them, creating a warm atmosphere.
Second round output: On the sunlit beach, people and dogs share joyful moments.
cURL
Sample code
curl -X POST https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"model": "qwen-vl-max",
"input":{
"messages":[
{
"role": "user",
"content": [
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},
{"text": "What is this?"}
]
},
{
"role": "assistant",
"content": [
{"text": "This is a dog and a girl."}
]
},
{
"role": "user",
"content": [
{"text": "Write a poem describing this scene"}
]
}
]
}
}'
Sample response
{
"output": {
"choices": [
{
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": [
{
"text": "Waves gently lap the shore, girl and dog play together. Sunlight shines on smiling faces, happy times forever remembered."
}
]
}
}
]
},
"usage": {
"output_tokens": 27,
"input_tokens": 1298,
"image_tokens": 1247
},
"request_id": "bdf5ef59-c92e-92a6-9d69-a738ecee1590"
}
Streaming output
In streaming output mode, the model returns intermediate results in real time as they are generated, instead of a single final response, so you can start reading output without waiting for the full response.
OpenAI compatible
Python
Sample code
from openai import OpenAI
import os
client = OpenAI(
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key=os.getenv("DASHSCOPE_API_KEY"),
base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
model="qwen-vl-max",
messages=[
{"role": "user",
"content": [{"type": "image_url",
"image_url": {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},},
{"type": "text", "text": "What is this"}]}],
stream=True
)
full_content = ""
print("Streaming output content:")
for chunk in completion:
    if chunk.choices[0].delta.content is None:
        continue
    full_content += chunk.choices[0].delta.content
    print(chunk.choices[0].delta.content)
print(f"Full content: {full_content}")
Sample response
Streaming output content:
This
is
a
photo
taken
on
the
beach
. In the photo,
a person and a dog
are sitting on the sand,
with the sea and
sky in the background. The person and
the dog seem to be interacting
, with the dog's front
paw resting on the person's
hand. Sunlight is shining from
the right side of the picture,
adding a warm atmosphere to the
entire scene.
Full content: This is a photo taken on the beach. In the photo, a person and a dog are sitting on the sand, with the sea and sky in the background. The person and the dog seem to be interacting, with the dog's front paw resting on the person's hand. Sunlight is shining from the right side of the picture, adding a warm atmosphere to the entire scene.
cURL
Sample code
curl --location 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-plus",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"
}
},
{
"type": "text",
"text": "What is this"
}
]
}
],
"stream":true,
"stream_options":{"include_usage":true}
}'
Sample response
data: {"choices":[{"delta":{"content":"","role":"assistant"},"index":0,"logprobs":null,"finish_reason":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"finish_reason":null,"delta":{"content":"In"},"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"the"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"photo"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":", a person and a dog"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"are sitting on the sand,"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"with the sea and"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"sky in the background."},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"The person and"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"the dog seem to be interacting"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":", with the dog's front"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"paw resting on the person's"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"hand. Sunlight is shining from"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"the right side of the picture,"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"delta":{"content":"adding a warm atmosphere to the"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[{"finish_reason":"stop","delta":{"content":"entire scene."},"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: {"choices":[],"object":"chat.completion.chunk","usage":{"prompt_tokens":1276,"completion_tokens":85,"total_tokens":1361},"created":1721823635,"system_fingerprint":null,"model":"qwen-vl-plus","id":"chatcmpl-9a9ec75a-3109-9910-b79e-7bcbce81c8f9"}
data: [DONE]
Node.js
Sample code
import OpenAI from "openai";
const openai = new OpenAI(
{
// If the environment variable is not configured, please replace the following line with: apiKey: "sk-xxx",
apiKey: process.env.DASHSCOPE_API_KEY,
baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
}
);
const completion = await openai.chat.completions.create({
model: "qwen-vl-max",
messages: [
{"role": "user",
"content": [{"type": "image_url",
"image_url": {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},},
{"type": "text", "text": "What is this"}]}],
stream: true,
});
let fullContent = ""
console.log("Streaming output content:")
for await (const chunk of completion) {
if (chunk.choices[0].delta.content != null) {
fullContent += chunk.choices[0].delta.content;
console.log(chunk.choices[0].delta.content);
}
}
console.log(`Full output content: ${fullContent}`)
Sample response
Streaming output content:
This
is
a
photo
taken
on
the
beach
. In the photo,
a person and a dog
are sitting on the sand,
with the sea and
sky in the background. The person and
the dog seem to be interacting
, with the dog's front
paw resting on the person's
hand. Sunlight is shining from
the right side of the picture,
adding a warm atmosphere to the
entire scene.
Full output content: This is a photo taken on the beach. In the photo, a person and a dog are sitting on the sand, with the sea and sky in the background. The person and the dog seem to be interacting, with the dog's front paw resting on the person's hand. Sunlight is shining from the right side of the picture, adding a warm atmosphere to the entire scene.
DashScope
Python
Sample code
import os
from dashscope import MultiModalConversation
import dashscope
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
messages = [
{
"role": "user",
"content": [
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},
{"text": "What is this?"}
]
}
]
responses = MultiModalConversation.call(
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key=os.getenv("DASHSCOPE_API_KEY"),
model='qwen-vl-max',
messages=messages,
stream=True,
incremental_output=True
)
full_content = ""
print("Streaming output content:")
for response in responses:
    try:
        print(response["output"]["choices"][0]["message"].content[0]["text"])
        full_content += response["output"]["choices"][0]["message"].content[0]["text"]
    except:
        pass
print(f"Full content: {full_content}")
Sample response
Streaming output content:
This
is
a
photo
taken
on
the
beach
. In the photo, there is a
lady and a dog
. The lady is sitting on the sand
, smiling and
interacting with the dog.
The dog is wearing a collar
, seemingly shaking hands with the
lady. The background is the sea and sky
, with sunlight shining on
them, creating a warm
atmosphere.
Full content: This is a photo taken on the beach. In the photo, there is a lady and a dog. The lady is sitting on the sand, smiling and interacting with the dog. The dog is wearing a collar, seemingly shaking hands with the lady. The background is the sea and sky, with sunlight shining on them, creating a warm atmosphere.
Java
Sample code
import java.util.Arrays;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import io.reactivex.Flowable;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope-intl.aliyuncs.com/api/v1";
}
public static void streamCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
// must create mutable map.
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(new HashMap<String, Object>(){{put("image", "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg");}},
new HashMap<String, Object>(){{put("text", "What is this");}})).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// If the environment variable is not configured, please replace the following line with: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-max")
.message(userMessage)
.incrementalOutput(true)
.build();
Flowable<MultiModalConversationResult> result = conv.streamCall(param);
result.blockingForEach(item -> {
try {
System.out.println(item.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
} catch (Exception e){
System.exit(0);
}
});
}
public static void main(String[] args) {
try {
streamCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
Sample response
This
is
a
photo
taken
on
the
beach
. In the photo,
a lady in a plaid
shirt is sitting on
the sand, interacting with
a golden retriever
wearing a collar. The background is
the sea and sky,
with sunlight shining on them,
creating a warm
atmosphere.
cURL
Sample code
curl -X POST https://dashscope-intl.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-H 'X-DashScope-SSE: enable' \
-d '{
"model": "qwen-vl-plus",
"input":{
"messages":[
{
"role": "system",
"content": [
{"text": "You are a helpful assistant."}
]
},
{
"role": "user",
"content": [
{"image": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"},
{"text": "Where is this image from?"}
]
}
]
},
"parameters": {
"incremental_output": true
}
}'
Sample response
id:1
event:result
:HTTP_STATUS/200
data:{"output":{"choices":[{"message":{"content":[{"text":"This"}],"role":"assistant"},"finish_reason":"null"}]},"usage":{"input_tokens":1278,"output_tokens":1,"image_tokens":1247},"request_id":"8b037000-c670-94cd-88d4-13318ddce1d0"}
id:2
event:result
:HTTP_STATUS/200
data:{"output":{"choices":[{"message":{"content":[{"text":"photo"}],"role":"assistant"},"finish_reason":"null"}]},"usage":{"input_tokens":1278,"output_tokens":2,"image_tokens":1247},"request_id":"8b037000-c670-94cd-88d4-13318ddce1d0"}
......
id:10
event:result
:HTTP_STATUS/200
data:{"output":{"choices":[{"message":{"content":[{"text":"lapping the shoreline and the distant horizon"}],"role":"assistant"},"finish_reason":"null"}]},"usage":{"input_tokens":1278,"output_tokens":56,"image_tokens":1247},"request_id":"8b037000-c670-94cd-88d4-13318ddce1d0"}
id:11
event:result
:HTTP_STATUS/200
data:{"output":{"choices":[{"message":{"content":[{"text":"with sunlight shining over."}],"role":"assistant"},"finish_reason":"stop"}]},"usage":{"input_tokens":1278,"output_tokens":63,"image_tokens":1247},"request_id":"8b037000-c670-94cd-88d4-13318ddce1d0"}
Use local files
The following code samples show how to use local files with the Qwen-VL models through the OpenAI SDK or DashScope SDK. The image used in the samples is test.png.
OpenAI compatible
Python
Sample code
from openai import OpenAI
import os
import base64
# Encode a local image file as a Base64 string
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
base64_image = encode_image("test.png")
client = OpenAI(
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key=os.getenv('DASHSCOPE_API_KEY'),
base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
model="qwen-vl-max",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
},
{"type": "text", "text": "What is this"},
],
}
],
)
print(completion.choices[0].message.content)
Sample response
This is a flying eagle. Eagles are birds of prey, usually with strong wings and sharp claws, adept at soaring high and hunting. The eagle in the image is soaring high, with a background of blue sky and white clouds, looking very spectacular.
HTTP
Sample code
import os
import base64
import requests
# Encode a local image file as a Base64 string
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
base64_image = encode_image("test.png")
# If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
api_key = os.getenv("DASHSCOPE_API_KEY")
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
payload = {
"model": "qwen-vl-max",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
},
{"type": "text", "text": "What is this"},
],
}
],
}
response = requests.post(
"https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions",
headers=headers,
json=payload,
)
print(response.json()["choices"][0]["message"]["content"])
Sample response
This is a flying eagle. Eagles are birds of prey, usually with strong wings and sharp claws, capable of soaring high and hunting prey. The eagle in the image is soaring high, with a background of blue sky and white clouds, looking very spectacular.
Node.js
Sample code
import OpenAI from "openai";
import { readFileSync } from 'fs';
const openai = new OpenAI(
{
// If the environment variable is not configured, please replace the following line with: apiKey: "sk-xxx",
apiKey: process.env.DASHSCOPE_API_KEY,
baseURL: "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
}
);
const encodeImage = (imagePath) => {
const imageFile = readFileSync(imagePath);
return imageFile.toString('base64');
};
const base64Image = encodeImage("test.png")
async function main() {
const completion = await openai.chat.completions.create({
model: "qwen-vl-max",
messages: [
{"role": "user",
"content": [{"type": "image_url",
"image_url": {"url": `data:image/jpeg;base64,${base64Image}`},},
{"type": "text", "text": "What is this"}]}]
});
console.log(completion.choices[0].message.content);
}
main();
Sample response
This is a flying eagle. Eagles are birds of prey, usually with strong wings and sharp claws, capable of soaring high and hunting prey. The eagle in the image is soaring high, with a background of blue sky and white clouds, looking very spectacular.
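DashScope
The DashScope Python SDK can also read a local file directly, without Base64 encoding, by passing a file:// URL that points to an absolute path. The snippet below is a minimal sketch following the SDK's local-file convention; test.png is the same local image used above, and the path handling shown assumes macOS/Linux.
Sample code
import os
from pathlib import Path
import dashscope
dashscope.base_http_api_url = 'https://dashscope-intl.aliyuncs.com/api/v1'
# file:// plus an absolute local path
local_path = Path("test.png").resolve()
messages = [
    {
        "role": "user",
        "content": [
            {"image": f"file://{local_path}"},
            {"text": "What is this?"}
        ]
    }
]
response = dashscope.MultiModalConversation.call(
    # If the environment variable is not configured, please replace the following line with: api_key="sk-xxx",
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-max',
    messages=messages
)
print(response.output.choices[0].message.content[0]["text"])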
Supported image formats
Format | Content Type | File extension |
BMP | image/bmp | .bmp |
DIB | image/bmp | .dib |
ICNS | image/icns | .icns |
ICO | image/x-icon | .ico |
JPEG | image/jpeg | .jfif, .jpe, .jpeg, .jpg |
JPEG2000 | image/jp2 | .j2c, .j2k, .jp2, .jpc, .jpf, .jpx |
PNG | image/png | .apng, .png |
SGI | image/sgi | .bw, .rgb, .rgba, .sgi |
TIFF | image/tiff | .tif, .tiff |
WEBP | image/webp | .webp |
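When you build a Base64 data URL for a local file (as in the samples above), the MIME type in the URL should match the Content Type column of this table. The helper below is a small sketch using Python's standard mimetypes module; it covers common formats such as PNG and JPEG, though not necessarily every extension listed.
import base64
import mimetypes
def to_data_url(image_path):
    # Guess the Content Type from the file extension, e.g. "image/png" for .png
    mime, _ = mimetypes.guess_type(image_path)
    if mime is None:
        raise ValueError(f"Cannot determine the image type of {image_path}")
    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime};base64,{encoded}"
# Pass the result as the "url" value of an image_url content item.
data_url = to_data_url("test.png")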
An input image must meet the following requirements:
The image must not exceed 10 MB in size.
For qwen-vl-max, the maximum number of pixels per image is 12 million, which accommodates standard 4K resolution. For qwen-vl-plus, the maximum number of pixels per image is 1,048,576, which is the number of pixels in a 1024 x 1024 image.
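Checking these limits locally before sending a request avoids a round trip that is bound to fail. The sketch below uses the Pillow library (an assumption; any image library that reports dimensions works) together with the limits quoted above.
import os
from PIL import Image  # pip install pillow
MAX_BYTES = 10 * 1024 * 1024  # 10 MB size limit
MAX_PIXELS = {"qwen-vl-max": 12_000_000, "qwen-vl-plus": 1_048_576}
def check_image(path, model="qwen-vl-max"):
    if os.path.getsize(path) > MAX_BYTES:
        raise ValueError(f"{path} exceeds the 10 MB size limit")
    with Image.open(path) as img:
        width, height = img.size
    if width * height > MAX_PIXELS[model]:
        raise ValueError(f"{path} has {width * height} pixels, above the {model} limit")
check_image("test.png")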
FAQ
Is it necessary to manually delete uploaded images?
Answer: No, manual deletion is not required. Once the text generation is complete, the server automatically removes the images.
Can Qwen-VL process video content?
Answer: No. Currently, Qwen-VL models do not support video content.
API reference
For more information about the input and output parameters, see Qwen.