#####
##### Sondos Aabed Explores the Bikeshare Dataset
#####
4+
5+ ### Importing the necessary libraries
6+ import time
7+ import pandas as pd
8+ import numpy as np
9+
# Maps each supported city name (lower-case, as typed by the user) to the
# CSV file that holds its trip records.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}
14+
# Collect the city / month / day filters interactively from the user.
def get_filters():
    """
    Asks user to specify a city, month, and day to analyze.

    Every answer is trimmed, lower-cased and has repeated inner whitespace
    collapsed before it is validated, and the question is repeated until one
    of the offered choices is entered.

    Returns:
        (str) city - name of the city to analyze, or "all" for every city
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    def _ask(prompt, choices, retry_message):
        # Keep asking until the normalised answer is one of `choices`.
        while True:
            answer = ' '.join(input(prompt).lower().split())
            if answer in choices:
                return answer
            print(retry_message)

    print('\n Hello! Let\' s explore some US bikeshare data!')

    # get user input for city (all, chicago, new york city, washington).
    city = _ask(
        "\n Which City would like to explore? All, Chicago, New york city, Or Washington?\n ",
        ('all', 'new york city', 'chicago', 'washington'),
        "Try to enter another city that is either: All, Chicago, New york city, Or Washington ",
    )

    # get user input for month (all, january, february, ... , june).
    # Only the months the prompt actually offers are accepted.
    month = _ask(
        "\n In which of the months you want to explore? is it (all, january, february, ... , june)\n ",
        ('all', 'january', 'february', 'march', 'april', 'may', 'june'),
        "Try to enter the month again, it wasn't a valid month!",
    )

    # get user input for day of week (all, monday, tuesday, ... sunday)
    day = _ask(
        "\n What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?\n ",
        ('all', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'),
        "You entered a not valid day, try again",
    )

    print('-' * 40)
    return city, month, day
61+
# Load the trip records for the chosen city (or every city) and apply the
# requested month / day filters.
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze (a key of CITY_DATA), or "all"
                     to combine every city's file into one frame
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day.
             No helper columns are added; 'Start Time' keeps the values read
             from the CSV.
    Raises:
        KeyError - if city is neither "all" nor a key of CITY_DATA
    """
    if city == 'all':
        # Stack every city's file; cities lacking a column get NaN there.
        frames = [pd.read_csv(path) for path in CITY_DATA.values()]
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.read_csv(CITY_DATA[city])

    month = month.lower()
    day = day.lower()
    if month != 'all' or day != 'all':
        # Parse into a temporary series so the frame's own columns stay as read.
        starts = pd.to_datetime(df['Start Time'])
        keep = pd.Series(True, index=df.index)
        if month != 'all':
            keep &= starts.dt.month_name().str.lower() == month
        if day != 'all':
            keep &= starts.dt.day_name().str.lower() == day
        # reset_index returns a fresh frame, so later in-place edits are safe.
        df = df[keep].reset_index(drop=True)

    return df
88+
# Cleaning pipeline: first break the timestamps into filterable columns,
# then deal with whatever values are still missing.
def clean_data(df):
    """Return df after date handling followed by missing-value handling."""
    return handle_missing(handle_dates(df))
96+
# Report how many values are missing and label the missing text entries.
def handle_missing(df):
    """
    Print the number of missing entries and fill the non-numeric ones.

    Only non-numeric, non-datetime columns are filled with the string
    'Unknown'. Numeric columns keep NaN, because writing a string into them
    would turn them into mixed-type object columns on which min/max/mode can
    no longer be computed.

    Args:
        df - Pandas DataFrame to clean; it is modified in place
    Returns:
        df - the same DataFrame object
    """
    print('We have {} missing enteries'.format(df.isnull().sum().sum()))

    # fill NaN values in the text columns only
    text_columns = df.select_dtypes(
        exclude=['number', 'datetime', 'datetimetz', 'timedelta']
    ).columns
    if len(text_columns) > 0:
        df[text_columns] = df[text_columns].fillna('Unknown')
    print('These were filled by (Unknown) ')

    # Be explicit about what was deliberately left untouched.
    still_missing = df.isnull().sum().sum()
    if still_missing:
        print('{} numeric or date entries were kept as missing so statistics skip them'.format(still_missing))
    return df
105+
# Break the trip timestamps into low-cardinality columns that are easy to
# group and filter by.
def handle_dates(df):
    """
    Split 'Start Time' and 'End Time' into month, day, year and time columns.

    For each of the two timestamps the columns <prefix>_month (lower-case
    month name), <prefix>_day (lower-case weekday name), <prefix>_year and
    <prefix>_time (strftime '%X') are added, with prefix 'start' or 'end'.
    The original timestamp columns are then dropped.

    'Birth Year', when the column exists, is converted to a number with
    unparsable values becoming NaN. It is not passed through to_datetime:
    that would read the year as a nanosecond offset and turn missing years
    into NaT, which to_numeric then maps to a huge negative integer.

    Args:
        df - Pandas DataFrame with 'Start Time' and 'End Time' columns;
             modified in place
    Returns:
        df - the same DataFrame object
    """
    for prefix, column in (('start', 'Start Time'), ('end', 'End Time')):
        stamps = pd.to_datetime(df[column])
        df[prefix + '_month'] = stamps.dt.strftime('%B').str.lower()
        df[prefix + '_day'] = stamps.dt.strftime('%A').str.lower()
        df[prefix + '_year'] = stamps.dt.strftime('%Y')
        df[prefix + '_time'] = stamps.dt.strftime('%X')

    # Not every city's file has this column, so only convert it when present.
    if 'Birth Year' in df.columns:
        df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce', downcast='integer')

    # the raw timestamps are no longer needed once they are split up
    df.drop(['Start Time', 'End Time'], axis=1, inplace=True)

    return df
138+
# Report the most frequent month, weekday and start hour using mode().
def time_stats(df):
    """
    Displays statistics on the most frequent times of travel.

    Reads the 'start_month', 'start_day' and 'start_time' columns. The start
    hour is taken from the text before the first ':' of 'start_time', so the
    result is an hour rather than a full HH:MM:SS time.

    Args:
        df - Pandas DataFrame holding the three columns named above
    """
    def _most_common(values):
        # mode() is empty for an empty selection; avoid indexing into it.
        top = values.mode()
        return top.iloc[0] if not top.empty else 'no data'

    print('\n Calculating The Most Frequent Times of Travel...\n ')
    start_time = time.time()

    # the most common month
    print('The most frequent month is: ', _most_common(df['start_month']))

    # the most common day of week
    print('The most frequent day is: ', _most_common(df['start_day']))

    # the most common start hour
    # NOTE(review): assumes 'start_time' is 24-hour 'HH:MM:SS' text - confirm
    # if a non-default locale is in use. Placeholders such as 'Unknown' are
    # coerced to NaN and ignored.
    hour_text = df['start_time'].astype(str).str.split(':', n=1).str[0]
    hours = pd.to_numeric(hour_text, errors='coerce').dropna().astype(int)
    print('The most commoon start hour is: ', _most_common(hours))

    print("\n This took %s seconds." % (time.time() - start_time))
    print('-' * 40)
158+
# Station popularity: the mode of each station column plus the most frequent
# (start, end) pairing found by grouping.
def station_stats(df):
    """Print the busiest start station, end station and start/end pairing."""
    print('\n Calculating The Most Popular Stations and Trip...\n ')
    began = time.time()

    # busiest departure point
    top_start = df['Start Station'].mode()[0]
    print('The most commonly used start station is: ', top_start)

    # busiest arrival point
    top_end = df['End Station'].mode()[0]
    print('The most commonly used end station is: ', top_end)

    # count every (start, end) pair and take the largest group
    pair_counts = df.groupby(['Start Station', 'End Station']).size()
    print('The most frequent combination of start station and end station trip is: ',
          pair_counts.idxmax())

    elapsed = time.time() - began
    print("\n This took {} seconds.".format(elapsed))
    print('-' * 40)
179+
# Trip duration statistics built on the sum and mean aggregations.
def trip_duration_stats(df):
    """
    Displays statistics on the total and average trip duration.

    Reads the 'Trip Duration' column. The conversions below treat it as
    seconds: the total is reported in days and the average in minutes.
    Non-numeric entries are ignored.

    Args:
        df - Pandas DataFrame with a 'Trip Duration' column
    """
    seconds_per_day = 86400
    seconds_per_minute = 60

    print('\n Calculating Trip Duration...\n ')
    start_time = time.time()

    # Coerce so that placeholder text left by an earlier fill step is skipped.
    durations = pd.to_numeric(df['Trip Duration'], errors='coerce')

    # total travel time, converted from seconds to days for readability
    # (the label previously said "hours" while the value was in days)
    print('The total travel time in days is: ', durations.sum() / seconds_per_day)

    # mean travel time
    print('The average travel time in minutes is: ', durations.mean() / seconds_per_minute)

    print("\n This took %s seconds." % (time.time() - start_time))
    print('-' * 40)
198+
# User statistics: counts per user type and gender, plus birth-year extremes.
def user_stats(df):
    """
    Displays statistics on bikeshare users.

    'User Type' is required. 'Gender' and 'Birth Year' are optional: when a
    column is absent, or holds no usable values, a short notice is printed
    instead of raising. Birth years are coerced to numbers first, so
    placeholder text such as 'Unknown' is left out of min/max/mode.

    Args:
        df - Pandas DataFrame with a 'User Type' column and, optionally,
             'Gender' and 'Birth Year' columns
    """
    print('\n Calculating User Stats...\n ')
    start_time = time.time()

    # counts of user types
    print('In this city, we have diffrent types of users as follows: ')
    print(df['User Type'].value_counts())

    # counts users based on gender
    if 'Gender' in df.columns:
        gender_counts = df['Gender'].value_counts()  # computed once, read three times
        print('The total count of each gender is as follow: ')
        print('Females:', gender_counts.get("Female", 0))
        print('Males:', gender_counts.get("Male", 0))
        print('Unknown:', gender_counts.get("Unknown", 0))
    else:
        print('Gender data is not available for this selection.')

    # Keep only real years so unknown entries do not take part in the stats.
    if 'Birth Year' in df.columns:
        birth_years = pd.to_numeric(df['Birth Year'], errors='coerce').dropna()
    else:
        birth_years = pd.Series(dtype='float64')

    if birth_years.empty:
        print('Birth year data is not available for this selection.')
    else:
        # earliest year of birth
        print('The earliest year of birth is: ', int(birth_years.min()))

        # most recent of birth
        print('The most recent year of birth is: ', int(birth_years.max()))

        # most common year of birth
        print('The most common year of birth is: ', int(birth_years.mode().iloc[0]))

    print("\n This took %s seconds." % (time.time() - start_time))
    print('-' * 40)
234+
def main():
    """Run the explore-and-report cycle until the user answers anything but yes."""
    reports = (time_stats, station_stats, trip_duration_stats, user_stats)

    keep_running = True
    while keep_running:
        # ask what to look at, then load and clean the matching records
        city, month, day = get_filters()
        df = clean_data(load_data(city, month, day))

        # show each statistics section in turn
        for report in reports:
            report(df)

        answer = input('\n Would you like to restart? Enter yes or no.\n ')
        keep_running = answer.lower() == 'yes'


############################

# This project lets the user interactively explore the
# bikeshare datasets of different cities.

############################
if __name__ == "__main__":
    main()
0 commit comments