Skip to content

Commit 7103427

Browse files
authored
Submit the BikeShare Dataset Project
1 parent a2463ad commit 7103427

File tree

1 file changed

+264
-0
lines changed

1 file changed

+264
-0
lines changed
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
#####
2+
##### Sondos Aabed Explores the Bikeshare Dataset
3+
#####
4+
5+
### Importing the necessary libraries
6+
import time
7+
import pandas as pd
8+
import numpy as np
9+
10+
#### this is the csv files dictionary
11+
# Maps each supported city name (lower-case, as typed by the user)
# to the CSV file that holds its trip records.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}
14+
15+
#### in this method get the filters inputted by the user
16+
def _ask_choice(prompt, valid_choices, retry_message):
    """Keep prompting until the user types one of *valid_choices*.

    Args:
        (str) prompt - text shown to the user on every attempt
        (tuple) valid_choices - accepted answers, all lower-case
        (str) retry_message - text printed after an invalid answer
    Returns:
        (str) the accepted answer, stripped and lower-cased
    """
    while True:
        # strip() so that stray spaces around an otherwise valid answer
        # (e.g. " Chicago ") do not cause a rejection
        answer = input(prompt).strip().lower()
        if answer in valid_choices:
            return answer
        print(retry_message)


def get_filters():
    """
    Asks user to specify a city, month, and day to analyze.

    Invalid input is handled by asking again until a valid value is typed.
    Answers are case-insensitive and surrounding whitespace is ignored.

    Returns:
        (str) city - name of the city to analyze, or "all" for every city
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    print('\nHello! Let\'s explore some US bikeshare data!')

    # get user input for city (all, chicago, new york city, washington)
    city = _ask_choice(
        "\n Which City would like to explore? All, Chicago, New york city, Or Washington?\n",
        ('all', 'new york city', 'chicago', 'washington'),
        "Try to enter another city that is either: Chicago, New york city, Or Washington ",
    )

    # get user input for month; any of the twelve month names is accepted
    month = _ask_choice(
        "\n In which of the months you want to explore? is it (all, january, february, ... , june)\n",
        ('all', 'january', 'february', 'march', 'april', 'may', 'june',
         'july', 'august', 'september', 'october', 'november', 'december'),
        "Try to enter the month again, it wasn't a valid month!",
    )

    # get user input for day of week (all, monday, tuesday, ... sunday)
    day = _ask_choice(
        "\n What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?\n",
        ('sunday', 'monday', 'all', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'),
        "You entered a not valid day, try again",
    )

    print('-'*40)
    return city, month, day
61+
62+
# in this method load the dataset based on which city the user inputs
63+
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze, or "all" to combine every
                     file listed in CITY_DATA
        (str) month - lower-case name of the month to filter by, or "all" to apply no month filter
        (str) day - lower-case name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day.
             It may be empty when no trip matches the filters.
    Raises:
        KeyError - if city is neither "all" nor a key of CITY_DATA
    """
    if city == 'all':
        # the "all" option combines every city to give a broader view
        frames = [pd.read_csv(path) for path in CITY_DATA.values()]
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.read_csv(CITY_DATA[city])

    if month != 'all' or day != 'all':
        # Parse into a temporary series so the 'Start Time' column itself is
        # returned untouched for the later cleaning step.
        started = pd.to_datetime(df['Start Time'])
        keep = pd.Series(True, index=df.index)
        if month != 'all':
            # month_name()/day_name() return English names by default, which
            # matches the English names the user is asked to type
            keep &= (started.dt.month_name().str.lower() == month)
        if day != 'all':
            keep &= (started.dt.day_name().str.lower() == day)
        df = df[keep].reset_index(drop=True)

    return df
88+
89+
## this method I created to clean the data
90+
## cleaning the data included handling missing data
91+
# also handle the high cardinality of dates
92+
def clean_data(df):
    """
    Run the cleaning steps on the loaded data and return the result.

    The date handling runs first, then the missing-value handling is applied
    to its output.
    """
    return handle_missing(handle_dates(df))
96+
97+
# this method I created to handle the missing data
98+
def handle_missing(df):
    """
    Report the number of missing entries and fill the textual ones.

    Only non-numeric, non-datetime columns are filled with the placeholder
    'Unknown'. Numeric columns keep their NaN values: writing a string into
    them would turn the column into mixed-type objects and break later
    min/max/sum/mean calls, whereas pandas aggregations skip NaN.

    Args:
        df - Pandas DataFrame to clean (modified in place)
    Returns:
        df - the same DataFrame object
    """
    print('We have {} missing enteries'.format(df.isnull().sum().sum()) )

    text_columns = [
        name for name in df.columns
        if not (pd.api.types.is_numeric_dtype(df[name])
                or pd.api.types.is_datetime64_any_dtype(df[name]))
    ]
    if text_columns:
        df[text_columns] = df[text_columns].fillna('Unknown')

    print('These were filled by (Unknown) ')
    return df
105+
106+
## this method I created to handle the dates
107+
def handle_dates(df):
    """
    Split the trip timestamps into filterable columns and tidy the birth year.

    'Start Time' and 'End Time' are parsed with pandas to_datetime and, because
    the raw timestamps have high cardinality, replaced by the columns
    start_month, start_day, start_year, start_time and the matching end_*
    columns. Month and day names are lower-case English names.

    'Birth Year', when present, is converted to a number (unparseable values
    become NaN). It is deliberately NOT passed through to_datetime: that would
    reinterpret the year as a timestamp offset and turn missing years into
    NaT, which corrupts the later min/max statistics.

    Args:
        df - Pandas DataFrame with 'Start Time' and 'End Time' columns
             (modified in place)
    Returns:
        df - the same DataFrame, without the two raw timestamp columns
    """
    for prefix, column in (('start', 'Start Time'), ('end', 'End Time')):
        stamps = pd.to_datetime(df[column])
        # month_name()/day_name() give English names regardless of the system
        # locale, unlike strftime('%B') / strftime('%A')
        df[prefix + '_month'] = stamps.dt.month_name().str.lower()
        df[prefix + '_day'] = stamps.dt.day_name().str.lower()
        df[prefix + '_year'] = stamps.dt.strftime('%Y')
        df[prefix + '_time'] = stamps.dt.strftime('%X')

    # handled as a number so the earliest / most recent year can be computed
    if 'Birth Year' in df.columns:
        df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce', downcast='integer')

    # the raw timestamp columns are no longer needed once they are split up
    df.drop(['Start Time', 'End Time'], axis=1, inplace=True)

    return df
138+
139+
# this method get the time travel frequent times
140+
# to get that I used the mode built-in method
141+
def time_stats(df):
    """
    Displays statistics on the most frequent times of travel.

    Prints the most common start month, start day and start hour, using the
    'start_month', 'start_day' and 'start_time' columns of df.

    Args:
        df - Pandas DataFrame with 'start_month', 'start_day' and 'start_time' columns
    """

    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()

    # the most common month
    print('The most frequent month is: ', df['start_month'].mode()[0])

    # the most common day of week
    print('The most frequent day is: ', df['start_day'].mode()[0])

    # the most common start hour
    # 'start_time' holds full clock strings, so taking its mode directly would
    # report the most common exact second rather than the hour. Keep only the
    # part before the first ':'.
    # NOTE(review): assumes an 'HH:MM:SS' style (24h) string - confirm if the
    # column is ever produced under a locale with a different time format.
    hour_text = df['start_time'].astype(str).str.split(':').str[0]
    hours = pd.to_numeric(hour_text, errors='coerce').dropna().astype(int)
    print('The most commoon start hour is: ', hours.mode()[0])

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
158+
159+
# in this method I get some statistics about the stations of the trip
160+
# used mode and groupby
161+
def station_stats(df):
    """
    Print the busiest start station, the busiest end station and the most
    frequent start/end station pairing, followed by the elapsed time.

    Args:
        df - Pandas DataFrame with 'Start Station' and 'End Station' columns
    """
    print('\nCalculating The Most Popular Stations and Trip...\n')
    began = time.time()

    # busiest departure point
    top_start = df['Start Station'].mode()[0]
    print('The most commonly used start station is: ', top_start)

    # busiest arrival point
    top_end = df['End Station'].mode()[0]
    print('The most commonly used end station is: ', top_end)

    # count every (start, end) pairing and pick the largest group
    pair_counts = df.groupby(['Start Station', 'End Station']).size()
    print('The most frequent combination of start station and end station trip is: ',
          pair_counts.idxmax())

    elapsed = time.time() - began
    print("\nThis took %s seconds." % elapsed)
    print('-'*40)
179+
180+
# In this method I get some statistics about the trip duration
181+
# used the sum, mean aggregation functions
182+
def trip_duration_stats(df):
    """
    Displays statistics on the total and average trip duration.

    The 'Trip Duration' values are treated as seconds: the total is reported
    in days (seconds / 86400) and the average in minutes (seconds / 60).

    Args:
        df - Pandas DataFrame with a 'Trip Duration' column
    """

    print('\nCalculating Trip Duration...\n')
    start_time = time.time()

    # Coerce to numbers so a placeholder string left in the column by an
    # earlier fill step is ignored instead of breaking sum()/mean().
    durations = pd.to_numeric(df['Trip Duration'], errors='coerce')

    # total travel time
    # dividing seconds by 86400 gives days, so the label says days (it used
    # to say hours while printing a value in days)
    print('The total travel time in days is: ', durations.sum()/86400)

    # mean travel time
    print('The average travel time in minutes is: ', durations.mean()/60)

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
198+
199+
# In this method I get some statistics about the users
200+
# Using value_counts, min, max and mode
201+
def user_stats(df):
    """
    Displays statistics on bikeshare users.

    Prints the count of each user type, the count of each gender and the
    earliest, most recent and most common year of birth.

    Args:
        df - Pandas DataFrame with a 'User Type' column and, optionally,
             'Gender' and 'Birth Year' columns
    """

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # counts of user types
    print('In this city, we have diffrent types of users as follows: ')
    print(df['User Type'].value_counts())

    # counts users based on gender
    # NOTE(review): guarded because a city file may not carry this column -
    # confirm against the actual CSV files.
    if 'Gender' in df.columns:
        gender_counts = df['Gender'].value_counts()  # computed once, read three times
        print('The total count of each gender is as follow: ')
        print('Females:', gender_counts.get("Female", 0))
        print('Males:', gender_counts.get("Male", 0))
        print('Unknown:', gender_counts.get("Unknown", 0))
    else:
        print('No gender data available.')

    # The column can contain the 'Unknown' placeholder written by the missing
    # data step; comparing strings with numbers makes min()/max() fail, so
    # keep only the values that really are years.
    if 'Birth Year' in df.columns:
        birth_years = pd.to_numeric(df['Birth Year'], errors='coerce').dropna()
    else:
        birth_years = pd.Series(dtype='float64')

    if birth_years.empty:
        print('No birth year data available.')
    else:
        birth_years = birth_years.astype(int)

        # earliest year of birth
        print('The earliest year of birth is: ', birth_years.min())

        # most recent of birth
        print('The most recent year of birth is: ', birth_years.max())

        # most common year of birth
        print('The most common year of birth is: ', birth_years.mode()[0])

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
234+
235+
def main():
    """
    Run the interactive exploration loop.

    Each round asks for the filters, loads and cleans the matching data and
    prints the time, station, trip-duration and user statistics. The loop
    repeats until the user answers anything other than "yes" to the restart
    question.
    """
    while True:
        # gets the filters
        city, month, day = get_filters()

        # load the dataset
        df = load_data(city, month, day)

        if df.empty:
            # The statistics below read the first entry of mode(), which
            # fails when there are no rows, so report and skip them.
            print('\nNo trips were found for that selection.')
        else:
            # clean the dataset
            df = clean_data(df)

            # Display different statistics of the dataset
            for report in (time_stats, station_stats, trip_duration_stats, user_stats):
                report(df)

        restart = input('\nWould you like to restart? Enter yes or no.\n')
        # strip() so " yes " is still understood as yes
        if restart.strip().lower() != 'yes':
            break
256+
257+
############################
258+
259+
# In this project the datasets of different cities are explored
260+
# interactively by the user
261+
262+
############################
263+
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)