Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions bigframes/operations/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ def get_runtime_json_str(
def exif(
self,
*,
engine: Literal[None, "pillow"] = None,
connection: Optional[str] = None,
max_batching_rows: int = 8192,
container_cpu: Union[float, int] = 0.33,
Expand All @@ -311,6 +312,7 @@ def exif(
"""Extract EXIF data. Currently only image types are supported.

Args:
engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
max_batching_rows (int, default 8,192): Max number of rows per batch sent to Cloud Run to execute the function.
container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
Expand All @@ -319,6 +321,8 @@ def exif(
Returns:
bigframes.series.Series: JSON series of key-value pairs.
"""
if engine is None or engine.casefold() != "pillow":
raise ValueError("Must specify the engine, supported value is 'pillow'.")

import bigframes.bigquery as bbq
import bigframes.blob._functions as blob_func
Expand All @@ -344,6 +348,7 @@ def image_blur(
self,
ksize: tuple[int, int],
*,
engine: Literal[None, "opencv"] = None,
dst: Optional[Union[str, bigframes.series.Series]] = None,
connection: Optional[str] = None,
max_batching_rows: int = 8192,
Expand All @@ -354,6 +359,7 @@ def image_blur(

Args:
ksize (tuple(int, int)): Kernel size.
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
str: GCS folder str. The output filenames are the same as the input files.
blob Series: The output file paths are determined by the uris of the blob Series.
Expand All @@ -367,6 +373,9 @@ def image_blur(
Returns:
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
"""
if engine is None or engine.casefold() != "opencv":
raise ValueError("Must specify the engine, supported value is 'opencv'.")

import bigframes.blob._functions as blob_func

connection = self._resolve_connection(connection)
Expand Down Expand Up @@ -424,6 +433,7 @@ def image_resize(
self,
dsize: tuple[int, int] = (0, 0),
*,
engine: Literal[None, "opencv"] = None,
fx: float = 0.0,
fy: float = 0.0,
dst: Optional[Union[str, bigframes.series.Series]] = None,
Expand All @@ -436,6 +446,7 @@ def image_resize(

Args:
dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size.
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size.
fy (float, default 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size.
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
Expand All @@ -451,6 +462,9 @@ def image_resize(
Returns:
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
"""
if engine is None or engine.casefold() != "opencv":
raise ValueError("Must specify the engine, supported value is 'opencv'.")

dsize_set = dsize[0] > 0 and dsize[1] > 0
fsize_set = fx > 0.0 and fy > 0.0
if not dsize_set ^ fsize_set:
Expand Down Expand Up @@ -516,6 +530,7 @@ def image_resize(
def image_normalize(
self,
*,
engine: Literal[None, "opencv"] = None,
alpha: float = 1.0,
beta: float = 0.0,
norm_type: str = "l2",
Expand All @@ -528,6 +543,7 @@ def image_normalize(
"""Normalize images.

Args:
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization.
beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization.
norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax".
Expand All @@ -544,6 +560,9 @@ def image_normalize(
Returns:
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
"""
if engine is None or engine.casefold() != "opencv":
raise ValueError("Must specify the engine, supported value is 'opencv'.")

import bigframes.blob._functions as blob_func

connection = self._resolve_connection(connection)
Expand Down Expand Up @@ -604,6 +623,7 @@ def image_normalize(
def pdf_extract(
self,
*,
engine: Literal[None, "pypdf"] = None,
connection: Optional[str] = None,
max_batching_rows: int = 1,
container_cpu: Union[float, int] = 2,
Expand All @@ -613,6 +633,7 @@ def pdf_extract(
"""Extracts text from PDF URLs and saves the text as string.

Args:
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
connection (str or None, default None): BQ connection used for
function internet transactions, and the output blob if "dst"
is str. If None, uses default connection of the session.
Expand All @@ -631,6 +652,9 @@ def pdf_extract(
Contains the extracted text from the PDF file.
Includes error messages if verbosity is enabled.
"""
if engine is None or engine.casefold() != "pypdf":
raise ValueError("Must specify the engine, supported value is 'pypdf'.")

import bigframes.bigquery as bbq
import bigframes.blob._functions as blob_func
import bigframes.pandas as bpd
Expand Down Expand Up @@ -663,6 +687,7 @@ def pdf_extract(
def pdf_chunk(
self,
*,
engine: Literal[None, "pypdf"] = None,
connection: Optional[str] = None,
chunk_size: int = 2000,
overlap_size: int = 200,
Expand All @@ -675,6 +700,7 @@ def pdf_chunk(
arrays of strings.

Args:
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
connection (str or None, default None): BQ connection used for
function internet transactions, and the output blob if "dst"
is str. If None, uses default connection of the session.
Expand All @@ -698,6 +724,8 @@ def pdf_chunk(
where each string is a chunk of text extracted from PDF.
Includes error messages if verbosity is enabled.
"""
if engine is None or engine.casefold() != "pypdf":
raise ValueError("Must specify the engine, supported value is 'pypdf'.")

import bigframes.bigquery as bbq
import bigframes.blob._functions as blob_func
Expand Down Expand Up @@ -740,6 +768,7 @@ def pdf_chunk(
def audio_transcribe(
self,
*,
engine: Literal["bigquery"] = "bigquery",
connection: Optional[str] = None,
model_name: Optional[
Literal[
Expand All @@ -753,6 +782,7 @@ def audio_transcribe(
Transcribe audio content using a Gemini multimodal model.

Args:
engine ('bigquery'): The engine (bigquery or third party library) used for the function.
connection (str or None, default None): BQ connection used for
function internet transactions, and the output blob if "dst"
is str. If None, uses default connection of the session.
Expand All @@ -770,6 +800,9 @@ def audio_transcribe(
Contains the transcribed text from the audio file.
Includes error messages if verbosity is enabled.
"""
if engine.casefold() != "bigquery":
raise ValueError("Must specify the engine, supported value is 'bigquery'.")

import bigframes.bigquery as bbq
import bigframes.ml.llm as llm
import bigframes.pandas as bpd
Expand Down
9 changes: 5 additions & 4 deletions notebooks/multimodal/multimodal_dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -254,16 +254,17 @@
"outputs": [],
"source": [
"df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n",
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\"\n",
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n",
")\n",
"df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n",
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\"\n",
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n",
")\n",
"df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n",
" alpha=50.0,\n",
" beta=150.0,\n",
" norm_type=\"minmax\",\n",
" dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n",
" engine=\"opencv\",\n",
")"
]
},
Expand All @@ -280,7 +281,7 @@
"outputs": [],
"source": [
"# You can also chain functions together\n",
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\")"
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")"
]
},
{
Expand Down Expand Up @@ -419,7 +420,7 @@
},
"outputs": [],
"source": [
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()"
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
]
},
{
Expand Down
9 changes: 5 additions & 4 deletions samples/snippets/multimodal_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,22 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:

# [START bigquery_dataframes_multimodal_dataframe_image_transform]
df_image["blurred"] = df_image["image"].blob.image_blur(
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/"
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/", engine="opencv"
)
df_image["resized"] = df_image["image"].blob.image_resize(
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/"
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/", engine="opencv"
)
df_image["normalized"] = df_image["image"].blob.image_normalize(
alpha=50.0,
beta=150.0,
norm_type="minmax",
dst=f"{dst_bucket}/image_normalize_transformed/",
engine="opencv",
)

# You can also chain functions together
df_image["blur_resized"] = df_image["blurred"].blob.image_resize(
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/"
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/", engine="opencv"
)
df_image
# [END bigquery_dataframes_multimodal_dataframe_image_transform]
Expand Down Expand Up @@ -113,7 +114,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
df_pdf = bpd.from_glob_path(
"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf"
)
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk()
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk(engine="pypdf")
chunked = df_pdf["chunked"].explode()
chunked
# [END bigquery_dataframes_multimodal_dataframe_pdf_chunk]
Expand Down
38 changes: 27 additions & 11 deletions tests/system/large/blob/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_blob_exif(
connection=bq_connection,
)

actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection)
actual = exif_image_df["blob_col"].blob.exif(engine="pillow", connection=bq_connection)
expected = bpd.Series(
['{"ExifOffset": 47, "Make": "MyCamera"}'],
session=session,
Expand All @@ -86,7 +86,7 @@ def test_blob_image_blur_to_series(
)

actual = images_mm_df["blob_col"].blob.image_blur(
(8, 8), dst=series, connection=bq_connection
(8, 8), dst=series, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand Down Expand Up @@ -114,7 +114,7 @@ def test_blob_image_blur_to_folder(
images_output_uris: list[str],
):
actual = images_mm_df["blob_col"].blob.image_blur(
(8, 8), dst=images_output_folder, connection=bq_connection
(8, 8), dst=images_output_folder, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand All @@ -136,7 +136,9 @@ def test_blob_image_blur_to_folder(


def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection)
actual = images_mm_df["blob_col"].blob.image_blur(
(8, 8), connection=bq_connection, engine="opencv"
)

assert isinstance(actual, bpd.Series)
assert len(actual) == 2
Expand All @@ -154,7 +156,7 @@ def test_blob_image_resize_to_series(
)

actual = images_mm_df["blob_col"].blob.image_resize(
(200, 300), dst=series, connection=bq_connection
(200, 300), dst=series, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand Down Expand Up @@ -182,7 +184,7 @@ def test_blob_image_resize_to_folder(
images_output_uris: list[str],
):
actual = images_mm_df["blob_col"].blob.image_resize(
(200, 300), dst=images_output_folder, connection=bq_connection
(200, 300), dst=images_output_folder, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand All @@ -205,7 +207,7 @@ def test_blob_image_resize_to_folder(

def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
actual = images_mm_df["blob_col"].blob.image_resize(
(200, 300), connection=bq_connection
(200, 300), connection=bq_connection, engine="opencv"
)

assert isinstance(actual, bpd.Series)
Expand All @@ -224,7 +226,12 @@ def test_blob_image_normalize_to_series(
)

actual = images_mm_df["blob_col"].blob.image_normalize(
alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection
alpha=50.0,
beta=150.0,
norm_type="minmax",
dst=series,
connection=bq_connection,
engine="opencv",
)
expected_df = pd.DataFrame(
{
Expand Down Expand Up @@ -257,6 +264,7 @@ def test_blob_image_normalize_to_folder(
norm_type="minmax",
dst=images_output_folder,
connection=bq_connection,
engine="opencv",
)
expected_df = pd.DataFrame(
{
Expand All @@ -279,7 +287,11 @@ def test_blob_image_normalize_to_folder(

def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
actual = images_mm_df["blob_col"].blob.image_normalize(
alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection
alpha=50.0,
beta=150.0,
norm_type="minmax",
connection=bq_connection,
engine="opencv",
)

assert isinstance(actual, bpd.Series)
Expand Down Expand Up @@ -322,7 +334,7 @@ def test_blob_pdf_extract(
):
actual = (
pdf_mm_df["pdf"]
.blob.pdf_extract(connection=bq_connection, verbose=verbose)
.blob.pdf_extract(connection=bq_connection, verbose=verbose, engine="pypdf")
.explode()
.to_pandas()
)
Expand Down Expand Up @@ -373,7 +385,11 @@ def test_blob_pdf_chunk(
actual = (
pdf_mm_df["pdf"]
.blob.pdf_chunk(
connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose
connection=bq_connection,
chunk_size=50,
overlap_size=10,
verbose=verbose,
engine="pypdf",
)
.explode()
.to_pandas()
Expand Down
Loading