Bounding box detection

In this experimental launch, we are providing developers with a powerful tool for object detection and localization within images and video. By accurately identifying and delineating objects with bounding boxes, developers can unlock a wide range of applications and enhance the intelligence of their projects.

Key Benefits:

  • Simple: Integrate object detection capabilities into your applications with ease, regardless of your computer vision expertise.
  • Customizable: Produce bounding boxes based on custom instructions (e.g. "I want to see bounding boxes of all the green objects in this image"), without having to train a custom model.

Technical Details:

  • Input: Your prompt and associated images or video frames.
  • Output: Bounding boxes in the [y_min, x_min, y_max, x_max] format. The origin is the top-left corner of the image; the x-axis runs horizontally and the y-axis runs vertically. Coordinate values are normalized to the range 0-1000 for every image, regardless of its pixel dimensions.
  • Visualization: AI Studio users will see bounding boxes plotted within the UI. Vertex AI users should visualize their bounding boxes through custom visualization code.

Python

Install

pip install --upgrade google-genai

To learn more, see the SDK reference documentation.

Set environment variables to use the Gen AI SDK with Vertex AI:

# Replace the `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION` values
# with appropriate values for your project.
export GOOGLE_CLOUD_PROJECT=GOOGLE_CLOUD_PROJECT
export GOOGLE_CLOUD_LOCATION=global
export GOOGLE_GENAI_USE_VERTEXAI=True

from io import BytesIO

import requests
from google import genai
from google.genai.types import (
    GenerateContentConfig,
    HarmBlockThreshold,
    HarmCategory,
    HttpOptions,
    Part,
    SafetySetting,
)
from PIL import Image, ImageColor, ImageDraw
from pydantic import BaseModel


# Helper class to represent a bounding box
class BoundingBox(BaseModel):
    """
    Represents a bounding box with its 2D coordinates and associated label.

    Attributes:
        box_2d (list[int]): A list of integers representing the 2D coordinates of the bounding box,
            typically in the format [y_min, x_min, y_max, x_max].
        label (str): A string representing the label or class associated with the object within the bounding box.
    """

    box_2d: list[int]
    label: str


# Helper function to plot bounding boxes on an image
def plot_bounding_boxes(image_uri: str, bounding_boxes: list[BoundingBox]) -> None:
    """
    Plots bounding boxes on an image with labels, using PIL and normalized coordinates.

    The image is downloaded over HTTP, each box is scaled from the 0-1000
    normalized range to pixel coordinates, and the annotated image is opened
    in the default viewer via ``Image.show()``.

    Args:
        image_uri: The URI of the image file.
        bounding_boxes: A list of BoundingBox objects. Each box's coordinates are in
            normalized [y_min, x_min, y_max, x_max] format. Boxes that do not
            carry exactly four coordinates are skipped.

    Raises:
        requests.HTTPError: If the image download returns an error status.
    """
    # Download the whole body and fail loudly on HTTP errors, so a 404 page is
    # never handed to PIL; the context manager releases the connection.
    with requests.get(image_uri, timeout=10) as http_response:
        http_response.raise_for_status()
        image_bytes = http_response.content

    with Image.open(BytesIO(image_bytes)) as im:
        width, height = im.size
        draw = ImageDraw.Draw(im)
        colors = list(ImageColor.colormap.keys())

        for i, bbox in enumerate(bounding_boxes):
            # Model output is not guaranteed to be well-formed; skip anything
            # that is not a 4-value box instead of crashing the visualization.
            if len(bbox.box_2d) != 4:
                continue
            y_min, x_min, y_max, x_max = bbox.box_2d

            # Scale normalized coordinates to image dimensions. Sorting each
            # pair keeps the corners ordered, since recent Pillow versions
            # reject rectangles whose second corner precedes the first.
            abs_y_min, abs_y_max = sorted(
                (int(y_min / 1000 * height), int(y_max / 1000 * height))
            )
            abs_x_min, abs_x_max = sorted(
                (int(x_min / 1000 * width), int(x_max / 1000 * width))
            )

            color = colors[i % len(colors)]

            # Draw the rectangle using the correct (x, y) pairs
            draw.rectangle(
                ((abs_x_min, abs_y_min), (abs_x_max, abs_y_max)),
                outline=color,
                width=4,
            )
            if bbox.label:
                # Position the text at the top-left corner of the box
                draw.text((abs_x_min + 8, abs_y_min + 6), bbox.label, fill=color)

        im.show()


client = genai.Client(http_options=HttpOptions(api_version="v1"))

config = GenerateContentConfig(
    system_instruction="""
    Return bounding boxes as an array with labels.
    Never return masks. Limit to 25 objects.
    If an object is present multiple times, give each object a unique label
    according to its distinct characteristics (colors, size, position, etc..).
    """,
    temperature=0.5,
    safety_settings=[
        SafetySetting(
            category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,
        ),
    ],
    # Ask for JSON matching list[BoundingBox] so the SDK can parse the reply.
    response_mime_type="application/json",
    response_schema=list[BoundingBox],
)

image_uri = "https://storage.googleapis.com/generativeai-downloads/images/socks.jpg"

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        Part.from_uri(
            file_uri=image_uri,
            mime_type="image/jpeg",
        ),
        "Output the positions of the socks with a face. Label according to position in the image.",
    ],
    config=config,
)
print(response.text)

# NOTE(review): `response.parsed` may be None when the reply could not be
# parsed into the schema (confirm against the SDK reference); guard so the
# plotting helper is never asked to iterate over None.
if response.parsed is None:
    print("No bounding boxes could be parsed from the response.")
else:
    plot_bounding_boxes(image_uri, response.parsed)

# Example response:
# [
#     {"box_2d": [6, 246, 386, 526], "label": "top-left light blue sock with cat face"},
#     {"box_2d": [234, 649, 650, 863], "label": "top-right light blue sock with cat face"},
# ]

Go

Learn how to install or update the Gen AI SDK for Go.

To learn more, see the SDK reference documentation.

Set environment variables to use the Gen AI SDK with Vertex AI:

# Replace the `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION` values
# with appropriate values for your project.
export GOOGLE_CLOUD_PROJECT=GOOGLE_CLOUD_PROJECT
export GOOGLE_CLOUD_LOCATION=global
export GOOGLE_GENAI_USE_VERTEXAI=True

import ( "context" "encoding/json" "fmt" "image" "image/color" "image/draw" "image/jpeg" "io" "net/http" "google.golang.org/genai" ) // BoundingBox represents a bounding box with coordinates and label. type BoundingBox struct { Box2D []int `json:"box_2d"` Label string `json:"label"` } // plotBoundingBoxes downloads the image and overlays bounding boxes. func plotBoundingBoxes(imageURI string, boundingBoxes []BoundingBox) error { resp, err := http.Get(imageURI) if err != nil { return fmt.Errorf("failed to download image: %w", err) } defer resp.Body.Close() img, err := jpeg.Decode(resp.Body) if err != nil { return fmt.Errorf("failed to decode image: %w", err) } bounds := img.Bounds() rgba := image.NewRGBA(bounds) draw.Draw(rgba, bounds, img, bounds.Min, draw.Src) // Simple red color for bounding boxes red := color.RGBA{255, 0, 0, 255} for _, bbox := range boundingBoxes { // scale normalized coordinates [01000] to absolute pixels yMin := bbox.Box2D[0] * bounds.Dy() / 1000 xMin := bbox.Box2D[1] * bounds.Dx() / 1000 yMax := bbox.Box2D[2] * bounds.Dy() / 1000 xMax := bbox.Box2D[3] * bounds.Dx() / 1000 // draw rectangle border for x := xMin; x <= xMax; x++ { rgba.Set(x, yMin, red) rgba.Set(x, yMax, red) } for y := yMin; y <= yMax; y++ { rgba.Set(xMin, y, red) rgba.Set(xMax, y, red) } } return nil } func generateBoundingBoxesWithText(w io.Writer) error { ctx := context.Background() client, err := genai.NewClient(ctx, &genai.ClientConfig{ HTTPOptions: genai.HTTPOptions{APIVersion: "v1"}, }) if err != nil { return fmt.Errorf("failed to create genai client: %w", err) } imageURI := "https://storage.googleapis.com/generativeai-downloads/images/socks.jpg" // Schema definition for []BoundingBox schema := &genai.Schema{ Type: genai.TypeArray, Items: &genai.Schema{ Type: genai.TypeObject, Properties: map[string]*genai.Schema{ "box_2d": { Type: genai.TypeArray, Items: &genai.Schema{Type: genai.TypeInteger}, }, "label": {Type: genai.TypeString}, }, Required: []string{"box_2d", 
"label"}, }, } config := &genai.GenerateContentConfig{ SystemInstruction: &genai.Content{ Parts: []*genai.Part{{	Text: "Return bounding boxes as an array with labels. Never return masks. Limit to 25 objects.",	}}, }, Temperature: float32Ptr(0.5), ResponseMIMEType: "application/json", ResponseSchema: schema, SafetySettings: []*genai.SafetySetting{ { Category: genai.HarmCategoryDangerousContent, Threshold: genai.HarmBlockThresholdBlockOnlyHigh, }, }, } contents := []*genai.Content{ { Role: "user", Parts: []*genai.Part{ { FileData: &genai.FileData{ FileURI: imageURI, MIMEType: "image/jpeg", }, }, {Text: "Output the positions of the socks with a face. Label according to position in the image."}, }, }, } resp, err := client.Models.GenerateContent(ctx, "gemini-2.5-flash", contents, config) if err != nil { return fmt.Errorf("failed to generate content: %w", err) } fmt.Fprintln(w, resp.Text()) // Parse into []BoundingBox var boxes []BoundingBox if err := json.Unmarshal([]byte(resp.Text()), &boxes); err != nil { return fmt.Errorf("failed to parse bounding boxes: %w", err) } // Example response: //Box: (962,113)-(2158,1631) Label: top left sock with face //Box: (2656,721)-(3953,2976) Label: top right sock with face //... return plotBoundingBoxes(imageURI, boxes) } func float32Ptr(v float32) *float32 { return &v }