AIPexStudio
diff --git a/packages/core/src/agent/aipex.test.ts b/packages/core/src/agent/aipex.test.ts
Lines changed: 128 additions & 0 deletions
diff --git a/packages/core/src/agent/aipex.ts b/packages/core/src/agent/aipex.ts
Lines changed: 27 additions & 5 deletions
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
Lines changed: 1 addition & 0 deletions
diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
Lines changed: 15 additions & 2 deletions
@@ -327,6 +327,134 @@ describe("AIPex", () => {
  expect(events[0]?.type).toBe("session_created");
  expect(agent.getConversationManager()).toBe(customManager);
  });
+
+ it("should pass images as multimodal AgentInputItem[] to run()", async () => { // chat() with images must call run() with one user message: text part first, then the image part
+ vi.mocked(run).mockResolvedValue( // stub run() with a canned result
+ createMockRunResult({
+ finalOutput: "I see a cat",
+ streamEvents: [
+ {
+ type: "raw_model_stream_event",
+ data: { type: "output_text_delta", delta: "I see a cat" },
+ },
+ ],
+ }),
+ );
+
+ const agent = AIPex.create({
+ instructions: "Describe images",
+ model: mockModel,
+ });
+
+ const events: AgentEvent[] = [];
+ for await (const event of agent.chat("What is in this image?", {
+ images: [{ image: "data:image/png;base64,abc123", detail: "high" }], // explicit detail is expected to be forwarded unchanged
+ })) {
+ events.push(event); // drain the generator; the collected events are not asserted in this test
+ }
+
+ expect(run).toHaveBeenCalledTimes(1);
+ const runCallArgs = vi.mocked(run).mock.calls[0]!; // arguments of that single run() call
+ const input = runCallArgs[1] as Array<{ // second argument of run() is the input; cast to the shape asserted below
+ type: string;
+ role: string;
+ content: Array<{
+ type: string;
+ text?: string;
+ image?: string;
+ detail?: string;
+ }>;
+ }>;
+
+ expect(Array.isArray(input)).toBe(true);
+ expect(input).toHaveLength(1); // exactly one message item
+ expect(input[0]!.role).toBe("user");
+ expect(input[0]!.content).toHaveLength(2); // text part + one image part
+ expect(input[0]!.content[0]).toEqual({
+ type: "input_text",
+ text: "What is in this image?",
+ });
+ expect(input[0]!.content[1]).toEqual({
+ type: "input_image",
+ image: "data:image/png;base64,abc123",
+ detail: "high",
+ });
+ });
+
+ it("should default image detail to 'auto' when not specified", async () => { // an image passed without "detail" must reach run() with detail "auto"
+ vi.mocked(run).mockResolvedValue(
+ createMockRunResult({ finalOutput: "OK" }),
+ );
+
+ const agent = AIPex.create({
+ instructions: "Test",
+ model: mockModel,
+ });
+
+ for await (const _ of agent.chat("Describe", {
+ images: [{ image: "https://example.com/img.png" }], // no detail given on purpose
+ })) {
+ // consume the generator; the yielded events are ignored
+ }
+
+ const runCallArgs = vi.mocked(run).mock.calls[0]!;
+ const input = runCallArgs[1] as Array<{
+ content: Array<{ type: string; detail?: string }>;
+ }>;
+ const imagePart = input[0]!.content[1]!; // index 0 is the text part, index 1 the image part
+ expect(imagePart.detail).toBe("auto");
+ });
+
+ it("should support multiple images in a single message", async () => { // every image becomes its own input_image part, in the order given
+ vi.mocked(run).mockResolvedValue(
+ createMockRunResult({ finalOutput: "Two images" }),
+ );
+
+ const agent = AIPex.create({
+ instructions: "Test",
+ model: mockModel,
+ });
+
+ for await (const _ of agent.chat("Compare these", {
+ images: [
+ { image: "img1_base64" },
+ { image: "img2_base64", detail: "low" },
+ ],
+ })) {
+ // consume the generator; the yielded events are ignored
+ }
+
+ const runCallArgs = vi.mocked(run).mock.calls[0]!;
+ const input = runCallArgs[1] as Array<{
+ content: Array<{ type: string; image?: string; detail?: string }>;
+ }>;
+ expect(input[0]!.content).toHaveLength(3); // text part + two image parts
+ expect(input[0]!.content[0]!.type).toBe("input_text");
+ expect(input[0]!.content[1]!.type).toBe("input_image");
+ expect(input[0]!.content[1]!.image).toBe("img1_base64");
+ expect(input[0]!.content[2]!.type).toBe("input_image");
+ expect(input[0]!.content[2]!.image).toBe("img2_base64");
+ expect(input[0]!.content[2]!.detail).toBe("low");
+ });
+
+ it("should pass plain string to run() when no images provided", async () => { // without images the input must stay a plain string, not a message array
+ vi.mocked(run).mockResolvedValue(
+ createMockRunResult({ finalOutput: "Reply" }),
+ );
+
+ const agent = AIPex.create({
+ instructions: "Test",
+ model: mockModel,
+ });
+
+ for await (const _ of agent.chat("Hello")) {
+ // consume the generator; the yielded events are ignored
+ }
+
+ const runCallArgs = vi.mocked(run).mock.calls[0]!;
+ expect(typeof runCallArgs[1]).toBe("string"); // second argument of run() is the input
+ expect(runCallArgs[1]).toBe("Hello");
+ });
  });
 
  describe("chat - continue conversation", () => {
 
@@ -1,4 +1,5 @@
 import {
+ type AgentInputItem,
  Agent as OpenAIAgent,
  type RunItemStreamEvent,
  run,
@@ -116,7 +117,7 @@ export class AIPex {
  }
 
  private async *runExecution(
- input: string,
+ input: string | AgentInputItem[],
  session: Session | null,
  ): AsyncGenerator<AgentEvent> {
  const startTime = Date.now();
@@ -344,7 +345,7 @@ export class AIPex {
  input: string,
  options?: ChatOptions,
  ): AsyncGenerator<AgentEvent> {
- let finalInput = input;
+ let finalTextInput = input;
  let chatOptions = options;
  let resolvedContexts: Context[] | undefined;
 
@@ -367,7 +368,7 @@ export class AIPex {
  resolvedContexts = contextObjs;
  // Format contexts and prepend to input
  const contextText = formatContextsForPrompt(contextObjs);
- finalInput = `${contextText}\n\n${input}`;
+ finalTextInput = `${contextText}\n\n${input}`;
 
  yield { type: "contexts_attached", contexts: contextObjs };
  }
@@ -382,11 +383,11 @@ export class AIPex {
  }
 
  const beforeChat = await this.runBeforeChatHooks({
- input: finalInput,
+ input: finalTextInput,
  options: chatOptions,
  contexts: resolvedContexts,
  });
- finalInput = beforeChat.input;
+ let finalInput: string | AgentInputItem[] = beforeChat.input;
  if (beforeChat.options) {
  chatOptions = { ...(chatOptions ?? {}), ...beforeChat.options };
  }
@@ -395,6 +396,27 @@ export class AIPex {
  chatOptions = { ...(chatOptions ?? {}), contexts: beforeChat.contexts };
  }
 
+ // When images are provided and the input is still plain text, wrap text + images in one multimodal user message
+ const images = chatOptions?.images;
+ if (images && images.length > 0 && typeof finalInput === "string") { // NOTE(review): a non-string finalInput (AgentInputItem[] coming back from the beforeChat hooks) skips this branch, so the images are not attached — confirm intended
+ const contentParts: Array<
+ | { type: "input_text"; text: string }
+ | { type: "input_image"; image: string; detail?: string } // NOTE(review): detail is a plain string here, wider than ImageInput's "auto" | "low" | "high"
+ > = [{ type: "input_text", text: finalInput }]; // the text part always comes first
+
+ for (const img of images) {
+ contentParts.push({
+ type: "input_image",
+ image: img.image,
+ detail: img.detail ?? "auto", // default detail when the caller gave none
+ });
+ }
+
+ finalInput = [
+ { type: "message", role: "user", content: contentParts },
+ ] as AgentInputItem[]; // type assertion rather than annotation: the literal's shape is asserted, not fully checked against AgentInputItem
+ }
+
  // If sessionId is provided, continue existing conversation
  if (chatOptions?.sessionId) {
  if (!this.conversationManager) {
 
@@ -82,6 +82,7 @@ export type {
  ConversationConfig,
  ForkInfo,
  FunctionTool,
+ ImageInput,
  MetricsPayload,
  OpenAIAgent,
  SerializedSession,
 
@@ -87,6 +87,13 @@ export interface CompressionOptions extends CompressionConfig {
  model: AiSdkModel;
 }
 
+export interface ImageInput { // one image attached to a chat message via ChatOptions.images
+ /** Image source as a string: base64-encoded image data, a URL, or a file ID. */
+ image: string;
+ /** Vision detail level. Treated as "auto" when omitted. */
+ detail?: "auto" | "low" | "high";
+}
+
 export interface ChatOptions {
  sessionId?: string;
  /**
@@ -95,6 +102,12 @@ export interface ChatOptions {
  * Context IDs will be resolved using the ContextManager.
  */
  contexts?: Context[] | string[];
+ /**
+ * Images to attach to this message.
+ * When non-empty, the text input and the images are sent as one multimodal
+ * user message: an `input_text` part followed by one `input_image` part per image.
+ */
+ images?: ImageInput[];
 }
 
 export interface AgentMetrics {
@@ -136,13 +149,13 @@ export type AgentEvent =
 // ============================================================================
 
 export interface BeforeChatPayload {
- input: string;
+ input: string | AgentInputItem[]; // NOTE(review): no AgentInputItem import is added to this file in this diff — confirm the type is already in scope
  options?: ChatOptions;
  contexts?: Context[];
 }
 
 export interface AfterResponsePayload {
- input: string;
+ input: string | AgentInputItem[];
  finalOutput: string;
  metrics: AgentMetrics;
  sessionId?: string;