Skip to content

Commit e2797a7

Browse files
committed
feat: add base ocr example
1 parent 0b59528 commit e2797a7

16 files changed

+880
-89
lines changed

README.md

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -275,23 +275,23 @@ while (true)
275275
Console.Write("Reasoning > ");
276276
reasoning = true;
277277
}
278-
278+
279279
Console.Write(choice.Message.ReasoningContent);
280280
continue;
281281
}
282-
282+
283283
if (reasoning)
284284
{
285285
reasoning = false;
286286
Console.WriteLine();
287287
Console.Write("Assistant > ");
288288
}
289-
289+
290290
Console.Write(choice.Message.Content);
291291
reply.Append(choice.Message.Content);
292292
usage = chunk.Usage;
293293
}
294-
294+
295295
Console.WriteLine();
296296
messages.Add(TextChatMessage.Assistant(reply.ToString()));
297297
if (usage != null)
@@ -593,6 +593,63 @@ await foreach (var modelResponse in response)
593593
}
594594
```
595595

596+
### OCR
597+
598+
Basic OCR example:
599+
600+
```csharp
601+
// Upload the local image; the returned link is what the request references.
await using var tilted = File.OpenRead("tilted.png");
var ossLink = await client.UploadTemporaryFileAsync("qwen-vl-ocr-latest", tilted, "tilted.jpg");
Console.WriteLine($"File uploaded: {ossLink}");

var messages = new List<MultimodalMessage>();
messages.Add(
    MultimodalMessage.User(
    [
        // set enableRotate to true if your source image is tilted.
        MultimodalMessageContent.ImageContent(ossLink, enableRotate: true),
    ]));

// Request a streamed completion; with IncrementalOutput each chunk is appended to the reply below.
var completion = client.GetMultimodalGenerationStreamAsync(
    new ModelRequest<MultimodalInput, IMultimodalParameters>()
    {
        Model = "qwen-vl-ocr-latest",
        Input = new MultimodalInput() { Messages = messages },
        Parameters = new MultimodalParameters()
        {
            IncrementalOutput = true,
        }
    });

var reply = new StringBuilder();
MultimodalTokenUsage? usage = null;
await foreach (var chunk in completion)
{
    var content = chunk.Output.Choices[0].Message.Content;
    if (content.Count == 0)
    {
        // Chunk carries no content items, nothing to print.
        continue;
    }

    // Echo the text as it arrives and accumulate it for the assistant message.
    var text = content[0].Text;
    Console.Write(text);
    reply.Append(text);
    usage = chunk.Usage;
}

Console.WriteLine();

// Keep the recognized text in the conversation history.
messages.Add(MultimodalMessage.Assistant([MultimodalMessageContent.TextContent(reply.ToString())]));
if (usage != null)
{
    Console.WriteLine(
        $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/total({usage.TotalTokens})");
}
651+
```
652+
596653
## Text-to-Speech
597654

598655
Create a speech synthesis session using `dashScopeClient.CreateSpeechSynthesizerSocketSessionAsync()`.

README.zh-Hans.md

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ public class YourService(IDashScopeClient client)
9999
- [长上下文(Qwen-Long)](#长上下文(Qwen-Long))
100100

101101
- [多模态](#多模态) - QWen-VL,QVQ 等,支持推理/视觉理解/OCR/音频理解等场景
102+
- [视觉理解/推理](#视觉理解/推理) - 图像/视频输入与理解,支持推理模式
103+
- [文字提取](#文字提取) - OCR 任务,读取表格/文档/公式等
104+
102105
- [语音合成](#语音合成) - CosyVoice,Sambert 等,支持 TTS 等应用场景
103106
- [图像生成](#图像生成) - wanx2.1 等,支持文生图,人像风格重绘等应用场景
104107
- [应用调用](#应用调用)
@@ -2445,6 +2448,105 @@ messages.Add(
24452448
]));
24462449
```
24472450
2451+
### 文字提取
2452+
2453+
使用 `qwen-vl-ocr` 系列模型可以很好地完成文字提取任务,基础用法(使用本地文件):
2454+
2455+
```csharp
2456+
// Upload the local image; the returned link is what the request references.
await using var tilted = File.OpenRead("tilted.png");
var ossLink = await client.UploadTemporaryFileAsync("qwen-vl-ocr-latest", tilted, "tilted.jpg");
Console.WriteLine($"File uploaded: {ossLink}");

// If your image is skewed, try setting enableRotate to true.
var image = MultimodalMessageContent.ImageContent(ossLink, enableRotate: true);
var messages = new List<MultimodalMessage>();
messages.Add(MultimodalMessage.User([image]));

// Build the request first, then start the streamed completion.
var request = new ModelRequest<MultimodalInput, IMultimodalParameters>()
{
    Model = "qwen-vl-ocr-latest",
    Input = new MultimodalInput { Messages = messages },
    Parameters = new MultimodalParameters { IncrementalOutput = true },
};
var completion = client.GetMultimodalGenerationStreamAsync(request);
2477+
```
2478+
2479+
完整示例:
2480+
2481+
```csharp
2482+
// Upload the local image; the returned link is what the request references.
await using var tilted = File.OpenRead("tilted.png");
var ossLink = await client.UploadTemporaryFileAsync("qwen-vl-ocr-latest", tilted, "tilted.jpg");
Console.WriteLine($"File uploaded: {ossLink}");

var messages = new List<MultimodalMessage>();
messages.Add(
    MultimodalMessage.User(
    [
        MultimodalMessageContent.ImageContent(ossLink, enableRotate: true),
    ]));

// Request a streamed completion; with IncrementalOutput each chunk is appended to the reply below.
var completion = client.GetMultimodalGenerationStreamAsync(
    new ModelRequest<MultimodalInput, IMultimodalParameters>()
    {
        Model = "qwen-vl-ocr-latest",
        Input = new MultimodalInput() { Messages = messages },
        Parameters = new MultimodalParameters()
        {
            IncrementalOutput = true,
        }
    });

var reply = new StringBuilder();
MultimodalTokenUsage? usage = null;
await foreach (var chunk in completion)
{
    var content = chunk.Output.Choices[0].Message.Content;
    if (content.Count == 0)
    {
        // Chunk carries no content items, nothing to print.
        continue;
    }

    // Echo the text as it arrives and accumulate it for the assistant message.
    var text = content[0].Text;
    Console.Write(text);
    reply.Append(text);
    usage = chunk.Usage;
}

Console.WriteLine();

// Keep the recognized text in the conversation history.
messages.Add(MultimodalMessage.Assistant([MultimodalMessageContent.TextContent(reply.ToString())]));
if (usage != null)
{
    Console.WriteLine(
        $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/total({usage.TotalTokens})");
}
2531+
2532+
/*
2533+
File uploaded: oss://dashscope-instant/52afe077fb4825c6d74411758cb1ab98/2025-11-28/435ea45f-9942-4fd4-983a-9ea8a3cd5ecb/tilted.jpg
2534+
产品介绍
2535+
本品采用韩国进口纤维丝制造,不缩水、不变形、不发霉、
2536+
不生菌、不伤物品表面。具有真正的不粘油、吸水力强、耐水
2537+
浸、清洗干净、无毒、无残留、易晾干等特点。
2538+
店家使用经验:不锈钢、陶瓷制品、浴盆、整体浴室大部分是
2539+
白色的光洁表面,用其他的抹布擦洗表面污渍不易洗掉,太尖
2540+
的容易划出划痕。使用这个仿真丝瓜布,沾少量中性洗涤剂揉
2541+
出泡沫,很容易把这些表面污渍擦洗干净。
2542+
6941990612023
2543+
货号:2023
2544+
Usage: in(2434)/out(155)/image(2410)/total(2589)
2545+
*/
2546+
```
2547+
2548+
2549+
24482550
## 语音合成
24492551
24502552
通过 `dashScopeClient.CreateSpeechSynthesizerSocketSessionAsync()` 来创建一个语音合成会话。

sample/Cnblogs.DashScope.Sample/Cnblogs.DashScope.Sample.csproj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
<None Update="sample.mp4">
3030
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
3131
</None>
32+
<!-- Sample image for the OCR sample; PreserveNewest matches sample.mp4 and avoids recopying it on every build. -->
<None Update="tilted.png">
  <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
3235
</ItemGroup>
3336

3437
<ItemGroup>
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
using System.Text;
2+
using Cnblogs.DashScope.Core;
3+
4+
namespace Cnblogs.DashScope.Sample.Multimodal;
5+
6+
/// <summary>
/// Sample that uploads a local image, runs it through the "qwen-vl-ocr-latest" model
/// with rotation enabled, and streams the recognized text to the console.
/// </summary>
public class OcrSample : ISample
{
    /// <inheritdoc />
    public string Description => "OCR Sample with rotate enabled";

    /// <inheritdoc />
    public async Task RunAsync(IDashScopeClient client)
    {
        // Upload the local image; the returned link is what the request references.
        await using var tilted = File.OpenRead("tilted.png");
        var ossLink = await client.UploadTemporaryFileAsync("qwen-vl-ocr-latest", tilted, "tilted.jpg");
        Console.WriteLine($"File uploaded: {ossLink}");

        var messages = new List<MultimodalMessage>();
        messages.Add(
            MultimodalMessage.User(
            [
                MultimodalMessageContent.ImageContent(ossLink, enableRotate: true),
            ]));

        // Request a streamed completion; with IncrementalOutput each chunk is appended to the reply below.
        var completion = client.GetMultimodalGenerationStreamAsync(
            new ModelRequest<MultimodalInput, IMultimodalParameters>()
            {
                Model = "qwen-vl-ocr-latest",
                Input = new MultimodalInput() { Messages = messages },
                Parameters = new MultimodalParameters()
                {
                    IncrementalOutput = true,
                }
            });

        var reply = new StringBuilder();
        MultimodalTokenUsage? usage = null;
        await foreach (var chunk in completion)
        {
            var content = chunk.Output.Choices[0].Message.Content;
            if (content.Count == 0)
            {
                // Chunk carries no content items, nothing to print.
                continue;
            }

            // Echo the text as it arrives and accumulate it for the assistant message.
            var text = content[0].Text;
            Console.Write(text);
            reply.Append(text);
            usage = chunk.Usage;
        }

        Console.WriteLine();

        // Keep the recognized text in the conversation history.
        messages.Add(MultimodalMessage.Assistant([MultimodalMessageContent.TextContent(reply.ToString())]));
        if (usage != null)
        {
            Console.WriteLine(
                $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/total({usage.TotalTokens})");
        }
    }
}
65+
66+
/*
67+
File uploaded: oss://dashscope-instant/52afe077fb4825c6d74411758cb1ab98/2025-11-28/435ea45f-9942-4fd4-983a-9ea8a3cd5ecb/tilted.jpg
68+
产品介绍
69+
本品采用韩国进口纤维丝制造,不缩水、不变形、不发霉、
70+
不生菌、不伤物品表面。具有真正的不粘油、吸水力强、耐水
71+
浸、清洗干净、无毒、无残留、易晾干等特点。
72+
店家使用经验:不锈钢、陶瓷制品、浴盆、整体浴室大部分是
73+
白色的光洁表面,用其他的抹布擦洗表面污渍不易洗掉,太尖
74+
的容易划出划痕。使用这个仿真丝瓜布,沾少量中性洗涤剂揉
75+
出泡沫,很容易把这些表面污渍擦洗干净。
76+
6941990612023
77+
货号:2023
78+
Usage: in(2434)/out(155)/image(2410)/total(2589)
79+
*/
1.82 MB
Loading

src/Cnblogs.DashScope.Core/MultimodalMessageContent.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ public record MultimodalMessageContent(
1919
int? MinPixels = null,
2020
int? MaxPixels = null,
2121
bool? EnableRotate = null,
22-
float? Fps = null)
22+
float? Fps = null,
23+
MultimodalOcrResult? OcrResult = null)
2324
{
2425
private const string OssSchema = "oss://";
2526

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
namespace Cnblogs.DashScope.Core;
2+
3+
/// <summary>
/// OCR model options: the task name and that task's configuration.
/// </summary>
public class MultimodalOcrOptions
{
    /// <summary>
    /// The name of the OCR task.
    /// </summary>
    /// <example>
    /// Example task names: "text_recognition", "key_information_extraction", "document_parsing", "table_parsing".
    /// </example>
    public string? Task { get; set; }

    /// <summary>
    /// The configuration for the task.
    /// </summary>
    public MultimodalOcrTaskConfig? TaskConfig { get; set; }
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
namespace Cnblogs.DashScope.Core;
2+
3+
/// <summary>
4+
/// OCR result from the model.
5+
/// </summary>
6+
/// <param name="WordsInfo">The words recognized by the model.</param>
7+
/// <param name="KvResult">Meta info extracted from the image.</param>
8+
public record MultimodalOcrResult(List<MultimodalOcrWordInfo>? WordsInfo, Dictionary<string, object?> KvResult);
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
namespace Cnblogs.DashScope.Core;
2+
3+
/// <summary>
/// Configuration of an OCR task.
/// </summary>
public class MultimodalOcrTaskConfig
{
    /// <summary>
    /// The JSON schema of the result; every leaf value should be an empty string.
    /// </summary>
    /// <example>
    /// <code>
    /// var schema = new Dictionary&lt;string, object&gt;()
    /// {
    ///     {
    ///         "Recipient info",
    ///         new Dictionary&lt;string, object&gt;()
    ///         {
    ///             { "Recipient name", string.Empty },
    ///             { "Recipient phone number", string.Empty },
    ///             { "Recipient address", string.Empty }
    ///         }
    ///     }
    /// };
    /// </code>
    /// </example>
    public Dictionary<string, object>? ResultSchema { get; set; }
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
namespace Cnblogs.DashScope.Core;
2+
3+
/// <summary>
4+
/// Word location and info recognized by the model.
5+
/// </summary>
6+
/// <param name="Text">OCR result.</param>
7+
/// <param name="RotateRect">The four corner points of the word rect; (0, 0) is the top-left of the image. The points are listed clockwise, starting from the top-left corner of the rect, e.g. [x1, y1, x2, y2, x3, y3, x4, y4].</param>
8+
/// <param name="Location">Another representation of the word rect. The first two values are the center point of the rect, followed by its width and height. The last value is the rect's rotation angle relative to the landscape (horizontal) orientation, e.g. [center_x, center_y, width, height, angle].</param>
9+
public record MultimodalOcrWordInfo(string Text, int[] RotateRect, int[] Location);

0 commit comments

Comments
 (0)