googleapis
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
Lines changed: 61 additions & 0 deletions
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
Lines changed: 12 additions & 0 deletions
diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py
Lines changed: 46 additions & 0 deletions
@@ -732,6 +732,67 @@ def predict(
 
  return df
 
+    def score(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y: Union[bpd.DataFrame, bpd.Series],
+        task_type: Literal[
+            "text_generation", "classification", "summarization", "question_answering"
+        ] = "text_generation",
+    ) -> bpd.DataFrame:
+        """Calculate evaluation metrics of the model. Only "gemini-pro" model is supported for now.
+
+        .. note::
+
+            This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+            Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+            and might have limited support. For more information, see the launch stage descriptions
+            (https://cloud.google.com/products#product-launch-stages).
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
+            for the outputs relevant to this model type.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation data, which contains only one column of input_text
+                that contains the prompt text to use when evaluating the model.
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation labels, which contains only one column of output_text
+                that you would expect to be returned by the model.
+            task_type (str):
+                The type of the task for LLM model. Default to "text_generation".
+                Possible values: "text_generation", "classification", "summarization", and "question_answering".
+
+        Returns:
+            bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
+
+        Raises:
+            RuntimeError: If there is no underlying BQML model to evaluate.
+            NotImplementedError: If the model name starts with "gemini-1.5".
+            ValueError: If X or y does not have exactly one column.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        # TODO(ashleyxu): Support gemini-1.5 when the rollout is ready. b/344891364.
+        model_name = self._bqml_model.model_name
+        if model_name.startswith("gemini-1.5"):
+            raise NotImplementedError("Score is not supported for gemini-1.5 model.")
+
+        prompts, labels = utils.convert_to_dataframe(X, y)
+
+        has_single_columns = len(prompts.columns) == 1 and len(labels.columns) == 1
+        if not has_single_columns:
+            raise ValueError(
+                f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
+            )
+
+        # BQML identifies the columns by name, so rename whatever single
+        # column each frame carries to the fixed names used below.
+        prompts = prompts.rename(
+            columns={cast(blocks.Label, prompts.columns[0]): "input_text"}
+        )
+        labels = labels.rename(
+            columns={cast(blocks.Label, labels.columns[0]): "output_text"}
+        )
+
+        return self._bqml_model.llm_evaluate(
+            prompts.join(labels, how="outer"), task_type
+        )
+
  def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator:
  """Save the model to BigQuery.
 
 
@@ -171,6 +171,18 @@ def llm_text_pandas_df():
  )
 
 
+@pytest.fixture(scope="session")
+def llm_fine_tune_df_default_index(
+    session: bigframes.Session,
+) -> bigframes.dataframe.DataFrame:
+    """Load the LLM tuning table and prepare prompt/label columns.
+
+    Reads ``llm_tuning.emotion_classification_train`` through the session,
+    adds a ``prompt`` column made of a fixed instruction followed by the
+    row's ``text`` value, and casts ``label`` to the "string" dtype.
+    """
+    instruction = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
+    tuning_df = session.read_gbq("llm_tuning.emotion_classification_train")
+    tuning_df["prompt"] = instruction + tuning_df["text"]
+    tuning_df["label"] = tuning_df["label"].astype("string")
+    return tuning_df
+
+
 @pytest.fixture(scope="session")
 def onnx_iris_pandas_df():
  """Data matching the iris dataset."""
 
@@ -15,6 +15,7 @@
 import pytest
 
 from bigframes.ml import llm
+from tests.system import utils
 
 
 def test_create_text_generator_model(
@@ -366,3 +367,48 @@ def test_gemini_text_generator_predict_with_params_success(
  assert "ml_generate_text_llm_result" in df.columns
  series = df["ml_generate_text_llm_result"]
  assert all(series.str.len() > 20)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
+    """Score gemini-pro with the default task type and check the result schema."""
+    expected_columns = [
+        "bleu4_score",
+        "rouge-l_precision",
+        "rouge-l_recall",
+        "rouge-l_f1_score",
+        "evaluation_status",
+    ]
+    gemini_model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Single-column DataFrames are passed here; no fit step happens before score.
+    prompts = llm_fine_tune_df_default_index[["prompt"]]
+    labels = llm_fine_tune_df_default_index[["label"]]
+    score_result = gemini_model.score(X=prompts, y=labels).to_pandas()
+
+    utils.check_pandas_df_schema_and_index(
+        score_result, columns=expected_columns, index=1
+    )
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
+    """Score gemini-pro with task_type="classification" and check the result schema."""
+    expected_columns = [
+        "precision",
+        "recall",
+        "f1_score",
+        "label",
+        "evaluation_status",
+    ]
+    gemini_model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Series inputs are passed here (not DataFrames); no fit step happens before score.
+    prompts = llm_fine_tune_df_default_index["prompt"]
+    labels = llm_fine_tune_df_default_index["label"]
+    score_result = gemini_model.score(
+        X=prompts, y=labels, task_type="classification"
+    ).to_pandas()
+
+    utils.check_pandas_df_schema_and_index(
+        score_result, columns=expected_columns, index=6
+    )