Skip to content

Commit d73e034

Browse files
authored
feat: add support for image inputs in chat functionality (#179)
1 parent a60b864 commit d73e034

File tree

4 files changed

+171
-7
lines changed

4 files changed

+171
-7
lines changed

packages/core/src/agent/aipex.test.ts

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,134 @@ describe("AIPex", () => {
327327
expect(events[0]?.type).toBe("session_created");
328328
expect(agent.getConversationManager()).toBe(customManager);
329329
});
330+
331+
// Multimodal path: when chat() is called with an `images` option, the second
// argument handed to run() must be a one-element array whose single item has
// role "user" and carries the prompt text followed by the image part.
it("should pass images as multimodal AgentInputItem[] to run()", async () => {
332+
// Stub run() so no real model call happens; the mocked result carries one
// text-delta stream event and a final output.
vi.mocked(run).mockResolvedValue(
333+
createMockRunResult({
334+
finalOutput: "I see a cat",
335+
streamEvents: [
336+
{
337+
type: "raw_model_stream_event",
338+
data: { type: "output_text_delta", delta: "I see a cat" },
339+
},
340+
],
341+
}),
342+
);
343+
344+
const agent = AIPex.create({
345+
instructions: "Describe images",
346+
model: mockModel,
347+
});
348+
349+
// Drain the async generator so chat() runs to completion before asserting.
const events: AgentEvent[] = [];
350+
for await (const event of agent.chat("What is in this image?", {
351+
images: [{ image: "data:image/png;base64,abc123", detail: "high" }],
352+
})) {
353+
events.push(event);
354+
}
355+
356+
expect(run).toHaveBeenCalledTimes(1);
357+
const runCallArgs = vi.mocked(run).mock.calls[0]!;
358+
// Index 1 is the second positional argument of run(); it is cast to a
// minimal structural shape that covers only the fields asserted below.
const input = runCallArgs[1] as Array<{
359+
type: string;
360+
role: string;
361+
content: Array<{
362+
type: string;
363+
text?: string;
364+
image?: string;
365+
detail?: string;
366+
}>;
367+
}>;
368+
369+
expect(Array.isArray(input)).toBe(true);
370+
expect(input).toHaveLength(1);
371+
expect(input[0]!.role).toBe("user");
372+
expect(input[0]!.content).toHaveLength(2);
373+
// The text part comes first and carries the prompt unchanged.
expect(input[0]!.content[0]).toEqual({
374+
type: "input_text",
375+
text: "What is in this image?",
376+
});
377+
// The image part follows, with the caller's image string and detail level.
expect(input[0]!.content[1]).toEqual({
378+
type: "input_image",
379+
image: "data:image/png;base64,abc123",
380+
detail: "high",
381+
});
382+
});
383+
384+
// Default handling: an image supplied without a `detail` field must reach
// run() with detail set to "auto".
it("should default image detail to 'auto' when not specified", async () => {
385+
vi.mocked(run).mockResolvedValue(
386+
createMockRunResult({ finalOutput: "OK" }),
387+
);
388+
389+
const agent = AIPex.create({
390+
instructions: "Test",
391+
model: mockModel,
392+
});
393+
394+
// The image is given as a URL string and deliberately omits `detail`.
for await (const _ of agent.chat("Describe", {
395+
images: [{ image: "https://example.com/img.png" }],
396+
})) {
397+
// consume
398+
}
399+
400+
const runCallArgs = vi.mocked(run).mock.calls[0]!;
401+
const input = runCallArgs[1] as Array<{
402+
content: Array<{ type: string; detail?: string }>;
403+
}>;
404+
// content[1] is the part after the leading text part, i.e. the image part.
const imagePart = input[0]!.content[1]!;
405+
expect(imagePart.detail).toBe("auto");
406+
});
407+
408+
// Multiple images: two images in one chat() call must produce three content
// parts (text, then each image in the order given), and an explicit `detail`
// value must be passed through unchanged.
it("should support multiple images in a single message", async () => {
409+
vi.mocked(run).mockResolvedValue(
410+
createMockRunResult({ finalOutput: "Two images" }),
411+
);
412+
413+
const agent = AIPex.create({
414+
instructions: "Test",
415+
model: mockModel,
416+
});
417+
418+
// The first image omits `detail`; the second sets it to "low".
for await (const _ of agent.chat("Compare these", {
419+
images: [
420+
{ image: "img1_base64" },
421+
{ image: "img2_base64", detail: "low" },
422+
],
423+
})) {
424+
// consume
425+
}
426+
427+
const runCallArgs = vi.mocked(run).mock.calls[0]!;
428+
const input = runCallArgs[1] as Array<{
429+
content: Array<{ type: string; image?: string; detail?: string }>;
430+
}>;
431+
// One text part plus two image parts.
expect(input[0]!.content).toHaveLength(3);
432+
expect(input[0]!.content[0]!.type).toBe("input_text");
433+
expect(input[0]!.content[1]!.type).toBe("input_image");
434+
expect(input[0]!.content[1]!.image).toBe("img1_base64");
435+
expect(input[0]!.content[2]!.type).toBe("input_image");
436+
expect(input[0]!.content[2]!.image).toBe("img2_base64");
437+
expect(input[0]!.content[2]!.detail).toBe("low");
438+
});
439+
440+
// Text-only path: when chat() is called without options, the second argument
// handed to run() must remain the original string rather than an array.
it("should pass plain string to run() when no images provided", async () => {
441+
vi.mocked(run).mockResolvedValue(
442+
createMockRunResult({ finalOutput: "Reply" }),
443+
);
444+
445+
const agent = AIPex.create({
446+
instructions: "Test",
447+
model: mockModel,
448+
});
449+
450+
for await (const _ of agent.chat("Hello")) {
451+
// consume
452+
}
453+
454+
const runCallArgs = vi.mocked(run).mock.calls[0]!;
455+
// Check both the runtime type and the exact value of run()'s second argument.
expect(typeof runCallArgs[1]).toBe("string");
456+
expect(runCallArgs[1]).toBe("Hello");
457+
});
330458
});
331459

332460
describe("chat - continue conversation", () => {

packages/core/src/agent/aipex.ts

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import {
2+
type AgentInputItem,
23
Agent as OpenAIAgent,
34
type RunItemStreamEvent,
45
run,
@@ -116,7 +117,7 @@ export class AIPex {
116117
}
117118

118119
private async *runExecution(
119-
input: string,
120+
input: string | AgentInputItem[],
120121
session: Session | null,
121122
): AsyncGenerator<AgentEvent> {
122123
const startTime = Date.now();
@@ -344,7 +345,7 @@ export class AIPex {
344345
input: string,
345346
options?: ChatOptions,
346347
): AsyncGenerator<AgentEvent> {
347-
let finalInput = input;
348+
let finalTextInput = input;
348349
let chatOptions = options;
349350
let resolvedContexts: Context[] | undefined;
350351

@@ -367,7 +368,7 @@ export class AIPex {
367368
resolvedContexts = contextObjs;
368369
// Format contexts and prepend to input
369370
const contextText = formatContextsForPrompt(contextObjs);
370-
finalInput = `${contextText}\n\n${input}`;
371+
finalTextInput = `${contextText}\n\n${input}`;
371372

372373
yield { type: "contexts_attached", contexts: contextObjs };
373374
}
@@ -382,11 +383,11 @@ export class AIPex {
382383
}
383384

384385
const beforeChat = await this.runBeforeChatHooks({
385-
input: finalInput,
386+
input: finalTextInput,
386387
options: chatOptions,
387388
contexts: resolvedContexts,
388389
});
389-
finalInput = beforeChat.input;
390+
let finalInput: string | AgentInputItem[] = beforeChat.input;
390391
if (beforeChat.options) {
391392
chatOptions = { ...(chatOptions ?? {}), ...beforeChat.options };
392393
}
@@ -395,6 +396,27 @@ export class AIPex {
395396
chatOptions = { ...(chatOptions ?? {}), contexts: beforeChat.contexts };
396397
}
397398

399+
// When images are provided, build a multimodal UserMessageItem
400+
const images = chatOptions?.images;
401+
if (images && images.length > 0 && typeof finalInput === "string") {
402+
const contentParts: Array<
403+
| { type: "input_text"; text: string }
404+
| { type: "input_image"; image: string; detail?: string }
405+
> = [{ type: "input_text", text: finalInput }];
406+
407+
for (const img of images) {
408+
contentParts.push({
409+
type: "input_image",
410+
image: img.image,
411+
detail: img.detail ?? "auto",
412+
});
413+
}
414+
415+
finalInput = [
416+
{ type: "message", role: "user", content: contentParts },
417+
] as AgentInputItem[];
418+
}
419+
398420
// If sessionId is provided, continue existing conversation
399421
if (chatOptions?.sessionId) {
400422
if (!this.conversationManager) {

packages/core/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ export type {
8282
ConversationConfig,
8383
ForkInfo,
8484
FunctionTool,
85+
ImageInput,
8586
MetricsPayload,
8687
OpenAIAgent,
8788
SerializedSession,

packages/core/src/types.ts

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,13 @@ export interface CompressionOptions extends CompressionConfig {
8787
model: AiSdkModel;
8888
}
8989

90+
/**
 * One image attached to a chat message through `ChatOptions.images`.
 */
export interface ImageInput {
91+
/**
 * Base64-encoded image data, a URL, or a file ID.
 * NOTE(review): the tests in this commit pass a `data:` URL, an `https:` URL
 * and a bare string; which forms the model provider accepts is not visible
 * here and should be confirmed.
 */
92+
image: string;
93+
/**
 * Vision detail level. When omitted, `chat()` substitutes "auto" while
 * building the image content part.
 */
94+
detail?: "auto" | "low" | "high";
95+
}
96+
9097
export interface ChatOptions {
9198
sessionId?: string;
9299
/**
@@ -95,6 +102,12 @@ export interface ChatOptions {
95102
* Context IDs will be resolved using the ContextManager.
96103
*/
97104
contexts?: Context[] | string[];
105+
/**
106+
* Images to include with this message.
107+
* When provided, the text input and images are combined into a
108+
* multimodal UserMessageItem sent to the model's vision path.
109+
*/
110+
images?: ImageInput[];
98111
}
99112

100113
export interface AgentMetrics {
@@ -136,13 +149,13 @@ export type AgentEvent =
136149
// ============================================================================
137150

138151
/**
 * Object with the chat input, options and resolved contexts.
 * NOTE(review): presumably the shape exchanged with the before-chat hooks,
 * which `chat()` calls with these same three fields; confirm against the hook
 * runner's signature.
 *
 * This diff hunk shows both sides of the change to `input`: the removed
 * `string`-only declaration (gutter `139-`) and the added declaration widened
 * to `string | AgentInputItem[]` (gutter `152+`).
 */
export interface BeforeChatPayload {
139-
input: string;
152+
input: string | AgentInputItem[];
140153
options?: ChatOptions;
141154
contexts?: Context[];
142155
}
143156

144157
export interface AfterResponsePayload {
145-
input: string;
158+
input: string | AgentInputItem[];
146159
finalOutput: string;
147160
metrics: AgentMetrics;
148161
sessionId?: string;

0 commit comments

Comments
 (0)