|
12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
|
| 15 | +import math |
| 16 | + |
15 | 17 | import pandas as pd |
16 | 18 | import pytest |
17 | 19 |
|
@@ -302,3 +304,174 @@ def test_train_test_split_stratify(df_fixture, request): |
302 | 304 | test_counts, |
303 | 305 | check_index_type=False, |
304 | 306 | ) |
| 307 | + |
| 308 | + |
@pytest.mark.parametrize("n_splits", (3, 5, 10))
def test_KFold_get_n_splits(n_splits):
    """KFold.get_n_splits() reports the value passed to the constructor."""
    splitter = model_selection.KFold(n_splits)
    reported = splitter.get_n_splits()
    assert reported == n_splits
| 316 | + |
| 317 | + |
@pytest.mark.parametrize(
    "df_fixture",
    ("penguins_df_default_index", "penguins_df_null_index"),
)
@pytest.mark.parametrize(
    "n_splits",
    (3, 5),
)
def test_KFold_split(df_fixture, n_splits, request):
    """Check the types, sizes and count of the folds from KFold.split(X, y).

    For every fold, the train/test pieces must be bigframes DataFrames (X)
    and Series (y), and the test piece must hold either floor(n / n_splits)
    or ceil(n / n_splits) rows with the train piece holding the remainder.
    The number of folds yielded must equal ``n_splits``.
    """
    df = request.getfixturevalue(df_fixture)

    kf = model_selection.KFold(n_splits=n_splits)

    feature_columns = ["species", "island", "culmen_length_mm"]
    n_features = len(feature_columns)
    X = df[feature_columns]
    y = df["body_mass_g"]

    # Evaluate the row count once instead of on every use below.
    n_rows = len(df)
    len_test_upper = math.ceil(n_rows / n_splits)
    len_test_lower = math.floor(n_rows / n_splits)
    len_train_upper = n_rows - len_test_lower
    len_train_lower = n_rows - len_test_upper

    n_folds = 0
    for X_train, X_test, y_train, y_test in kf.split(X, y):  # type: ignore
        n_folds += 1

        assert isinstance(X_train, bpd.DataFrame)
        assert isinstance(X_test, bpd.DataFrame)
        assert isinstance(y_train, bpd.Series)
        assert isinstance(y_test, bpd.Series)

        # Depending on the iteration, the train/test sizes can differ by one
        # row: a smaller test piece goes with a larger train piece and vice
        # versa.
        assert (
            X_train.shape == (len_train_upper, n_features)
            and y_train.shape == (len_train_upper,)
            and X_test.shape == (len_test_lower, n_features)
            and y_test.shape == (len_test_lower,)
        ) or (
            X_train.shape == (len_train_lower, n_features)
            and y_train.shape == (len_train_lower,)
            and X_test.shape == (len_test_upper, n_features)
            and y_test.shape == (len_test_upper,)
        )

    # Guard against a vacuous pass: the per-fold assertions above never run
    # if split() yields nothing.
    assert n_folds == n_splits
| 366 | + |
| 367 | + |
@pytest.mark.parametrize(
    "df_fixture",
    ("penguins_df_default_index", "penguins_df_null_index"),
)
@pytest.mark.parametrize(
    "n_splits",
    (3, 5),
)
def test_KFold_split_X_only(df_fixture, n_splits, request):
    """Check the folds from KFold.split(X, y=None).

    For every fold, the X pieces must be bigframes DataFrames, both y pieces
    must be None, and the test piece must hold either floor(n / n_splits) or
    ceil(n / n_splits) rows with the train piece holding the remainder.
    The number of folds yielded must equal ``n_splits``.
    """
    df = request.getfixturevalue(df_fixture)

    kf = model_selection.KFold(n_splits=n_splits)

    feature_columns = ["species", "island", "culmen_length_mm"]
    n_features = len(feature_columns)
    X = df[feature_columns]

    # Evaluate the row count once instead of on every use below.
    n_rows = len(df)
    len_test_upper = math.ceil(n_rows / n_splits)
    len_test_lower = math.floor(n_rows / n_splits)
    len_train_upper = n_rows - len_test_lower
    len_train_lower = n_rows - len_test_upper

    n_folds = 0
    for X_train, X_test, y_train, y_test in kf.split(X, y=None):  # type: ignore
        n_folds += 1

        assert isinstance(X_train, bpd.DataFrame)
        assert isinstance(X_test, bpd.DataFrame)
        assert y_train is None
        assert y_test is None

        # Depending on the iteration, the train/test sizes can differ by one
        # row: a smaller test piece goes with a larger train piece and vice
        # versa.
        assert (
            X_train.shape == (len_train_upper, n_features)
            and X_test.shape == (len_test_lower, n_features)
        ) or (
            X_train.shape == (len_train_lower, n_features)
            and X_test.shape == (len_test_upper, n_features)
        )

    # Guard against a vacuous pass: the per-fold assertions above never run
    # if split() yields nothing.
    assert n_folds == n_splits
| 411 | + |
| 412 | + |
def test_KFold_seeded_correct_rows(session, penguins_pandas_df_default_index):
    """With random_state=42, the first fold contains the expected row labels."""
    kf = model_selection.KFold(random_state=42)

    # The pandas fixture is used because this test depends on a stable row
    # order being present end to end. Keep only the heaviest penguins so the
    # expected index lists below stay a reasonable size.
    source = penguins_pandas_df_default_index
    heavy_mask = source.body_mass_g > 5500
    all_data = source[heavy_mask]

    # Note that bigframes loses the index if it doesn't have a name
    all_data.index.name = "rowindex"

    df = session.read_pandas(all_data)

    feature_columns = ["species", "island", "culmen_length_mm"]
    X = df[feature_columns]
    y = df["body_mass_g"]

    # Only the first fold yielded by split() is inspected.
    X_train, X_test, y_train, y_test = next(kf.split(X, y))  # type: ignore

    # Download every piece and order it by row label before comparing.
    sorted_pieces = [
        piece.to_pandas().sort_index()
        for piece in (X_train, X_test, y_train, y_test)
    ]

    expected_train: pd.Index = pd.Index(
        [
            144, 146, 148, 161, 168, 183, 217, 221, 225, 226, 237,
            244, 257, 262, 264, 266, 267, 269, 278, 289, 290, 291,
        ],
        dtype="Int64",
        name="rowindex",
    )
    expected_test: pd.Index = pd.Index(
        [186, 240, 245, 260, 263, 268],
        dtype="Int64",
        name="rowindex",
    )

    # Same order as the pieces: X_train, X_test, y_train, y_test.
    expected_indexes = (expected_train, expected_test, expected_train, expected_test)
    for piece, expected in zip(sorted_pieces, expected_indexes):
        pd.testing.assert_index_equal(piece.index, expected)
0 commit comments