- Notifications
You must be signed in to change notification settings - Fork 63
docs: add a code sample for creating a kmeans model #267
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
69fe5d7 5fb1d4f 523255f 3bb267a 73d2a46 b3c0578 c25aeb5 db9f439 e7bd5ef 2a7d575 809ed05 5dba2b9 2207941 5e00a3c 7c64227 11678e0 f95cd9f 0df2dec 06a2490 1a9f7d9 464cf1c d03f46c 019e243 72174f9 50a447d ac348bf 7ce5337 29b2e1f 7762f0f 479a828 1572ddd 3d77ddd 505b790 4505c5c cad2185 3ab8220 816881c 9b382d6 ae9a362 5eb59ec File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -13,20 +13,26 @@ | |
| # limitations under the License. | ||
| | ||
| def test_kmeans_sample(): | ||
| <<<<<<< HEAD | ||
| # [START bigquery_dataframes_bqml_kmeans] | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| | ||
| import bigframes.pandas as bpd | ||
| import bigframes | ||
| from bigframes import dataframe | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| ======= | ||
| >>>>>>> 73d2a4681212c1881366c298f44f228bbf208932 | ||
| import datetime | ||
SalemJorden marked this conversation as resolved. Show resolved Hide resolved SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| | ||
| #NOTE: ask about line below and whether it is needed outside of colab notebooks | ||
| #bigframes.options.bigquery.project= "username-testing" | ||
| import bigframes | ||
| from bigframes import dataframe | ||
| import bigframes.pandas as bpd | ||
SalemJorden marked this conversation as resolved. Show resolved Hide resolved | ||
| | ||
| # NOTE: ask about line below and whether it is needed outside of colab notebooks | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| # bigframes.options.bigquery.project= "username-testing" | ||
| # read_gbq: Loads a DataFrame from BigQuery | ||
| h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| s= bpd.read_gbq( | ||
| ''' | ||
| s = bpd.read_gbq( | ||
tswast marked this conversation as resolved. Show resolved Hide resolved | ||
| """ | ||
| SELECT | ||
| id, | ||
| ST_DISTANCE( | ||
| | @@ -35,43 +41,61 @@ def test_kmeans_sample(): | |
| ) / 1000 AS distance_from_city_center | ||
| FROM | ||
| `bigquery-public-data.london_bicycles.cycle_stations` s | ||
| ''' | ||
| """ | ||
| ) | ||
| # transform the data | ||
| h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} ) | ||
| h= h[["start_date", "station_name", "station_id", "duration"]] | ||
| h = h.rename( | ||
| columns={"start_station_name": "station_name", "start_station_id": "station_id"} | ||
| ) | ||
| h = h[["start_date", "station_name", "station_id", "duration"]] | ||
| | ||
| # NOTE: line below is not accessed, is it needed outside of colab notebook? | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| start_date = datetime.datetime.now() | ||
| sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) | ||
| sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) | ||
| sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) | ||
| sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) | ||
| | ||
| h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] | ||
| h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] | ||
| | ||
| isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday", | ||
| 4:"weekday",5:"weekend", 6:"weekend"}) | ||
| isweekday = h.start_date.dt.dayofweek.map( | ||
| { | ||
| 0: "weekday", | ||
| 1: "weekday", | ||
| 2: "weekday", | ||
| 3: "weekday", | ||
| 4: "weekday", | ||
| 5: "weekend", | ||
| 6: "weekend", | ||
| } | ||
| ) | ||
| | ||
| # create the new dataframe variable, stationstats | ||
| merged_df = h.merge( | ||
| right= s, | ||
| how="inner", | ||
| left_on= "station_id", | ||
| right_on= "id", | ||
| ) | ||
| right=s, | ||
| how="inner", | ||
| left_on="station_id", | ||
| right_on="id", | ||
| ) | ||
| | ||
| stationstats = merged_df.groupby("station_name").agg( | ||
| ||
| {"duration": ["mean", "count"], "distance_from_city_center": "max"} | ||
| ) | ||
| | ||
| stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"}) | ||
| def station_filter(): | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| stationstats.columns = ["duration","num_trips","distance_from_city_center"] | ||
| stationstats.sort_values(by = "distance_from_city_center", ascending = True) | ||
| filter = '''REGEXP_CONTAINS(station_name, 'Kennington')''' | ||
| stationstats.columns = ["duration", "num_trips", "distance_from_city_center"] | ||
| stationstats.sort_values(by="distance_from_city_center", ascending=True) | ||
| filter = """REGEXP_CONTAINS(station_name, 'Kennington')""" | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| | ||
| # import the KMeans model to cluster the data | ||
| from bigframes.ml.cluster import KMeans | ||
| | ||
| cluster_model = KMeans(n_clusters=4) | ||
tswast marked this conversation as resolved. Show resolved Hide resolved | ||
| cluster_model= cluster_model.fit(stationstats) | ||
| cluster_model = cluster_model.fit(stationstats) | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| | ||
| #the following function predicts the cluster of every station that has the string "Kennington" in its name. | ||
| # the following function predicts the cluster of every station that has the string "Kennington" in its name. | ||
| def predict_kennington_stations(): | ||
SalemJorden marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| cluster_model.predict(stationstats) | ||
| <<<<<<< HEAD | ||
| | ||
| # [END bigquery_dataframes_bqml_kmeans] | ||
| # [END bigquery_dataframes_bqml_kmeans] | ||
| ======= | ||
| >>>>>>> 73d2a4681212c1881366c298f44f228bbf208932 | ||
Uh oh!
There was an error while loading. Please reload this page.