# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# display every float in pandas output with exactly 3 decimal places
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# IPython magic: renders plots directly within the notebook
# (only valid inside a Jupyter/Colab session, not in a plain .py script)
%matplotlib inline

# import warnings and silence ALL of them for the rest of the session;
# note that this also hides deprecation notices from pandas/seaborn
import warnings
warnings.filterwarnings("ignore")

# mount Google Drive at /content/drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

# load the FoodHub orders CSV from the mounted Drive into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/MIT - ADSP/FoodHub Project/foodhub_order.csv')
# preview the first 5 rows
df.head(n=5)

# preview the last 5 rows
df.tail(n=5)

# (number of rows, number of columns)
df.shape

(1898, 9)

# prints a summary of the DataFrame: column names, non-null counts, datatypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB

# per-column count of missing values
df.isnull().sum()

# same per-column check through isna()
df.isna().sum()

# summary statistics for every column (numeric and non-numeric),
# transposed so that each column of the data becomes one row
df.describe(include='all').transpose()

# frequency of each distinct value in the 'rating' column
df['rating'].value_counts()

# number of orders whose rating is the literal string 'Not given'
(df['rating'] == 'Not given').sum()

736

# for each identifier column, draw a histogram followed by a boxplot
id_plots = (
    ('order_id', 'Order ID', 'Distribution of Order IDs'),
    ('customer_id', 'Customer ID', 'Distribution of Customer IDs'),
)
for column, axis_label, hist_title in id_plots:
    # histogram of the column
    sns.histplot(data=df, x=column)
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.title(hist_title)
    plt.show()

    # boxplot of the same column
    sns.boxplot(data=df, x=column)
    plt.xlabel(axis_label)
    plt.show()

# bar graph of the number of orders per restaurant, on an extra-wide figure
plt.figure(figsize=(45, 10))
ax = sns.countplot(data=df, x='restaurant_name', hue='restaurant_name')
ax.set(title='Orders by Restaurant Name', xlabel='Restaurant Name', ylabel='Orders')
plt.xticks(rotation=90)  # rotate the x axis labels 90 degrees
plt.show()

# bar graph of the number of orders per cuisine type
plt.figure(figsize=(15, 7))
ax = sns.countplot(data=df, x='cuisine_type', hue='cuisine_type')
ax.set(title='Orders by Cuisine Type', xlabel='Cuisine Type', ylabel='Orders')
plt.show()

# share of orders belonging to each cuisine type, as a percentage
df['cuisine_type'].value_counts(normalize=True).mul(100)

# histogram of 'cost_of_the_order'
ax = sns.histplot(data=df, x='cost_of_the_order')
ax.set(xlabel='Cost of the Order', ylabel='Frequency', title='Distribution of Order Cost')
plt.show()

# boxplot of 'cost_of_the_order'
ax = sns.boxplot(data=df, x='cost_of_the_order')
ax.set_xlabel('Cost of the Order')
plt.show()

# bar graph of the number of orders per day-of-week category
ax = sns.countplot(data=df, x='day_of_the_week', hue='day_of_the_week')
ax.set(title='Orders by Day of Week', xlabel='Day of Week', ylabel='Orders')
plt.show()

# bar graph of the number of orders per rating value, most frequent rating first
rating_order = df['rating'].value_counts().index
ax = sns.countplot(data=df, x='rating', hue='rating', order=rating_order)
ax.set(title='Orders by Rating', xlabel='Rating', ylabel='Orders')
plt.show()

# for each timing column, draw a histogram followed by a boxplot
time_plots = (
    ('food_preparation_time', 'Distribution of Food Preparation Time',
     'Food Preparation Time(minutes)', 'Orders'),
    ('delivery_time', 'Distribution of Delivery Time',
     'Delivery Time(minutes)', 'Count'),
)
for column, hist_title, axis_label, count_label in time_plots:
    # histogram of the column
    sns.histplot(data=df, x=column)
    plt.title(hist_title)
    plt.xlabel(axis_label)
    plt.ylabel(count_label)
    plt.show()

    # boxplot of the same column
    sns.boxplot(data=df, x=column)
    plt.xlabel(axis_label)
    plt.show()

# the 5 restaurants with the most orders
df['restaurant_name'].value_counts().iloc[:5]

# order counts per cuisine type within each day-of-week group, in descending order
df.groupby('day_of_the_week')['cuisine_type'].value_counts()

# bar graph of orders per cuisine type, split by day of week;
# cuisines are ordered from most to least ordered overall
plt.figure(figsize=(20, 5))
cuisine_order = df['cuisine_type'].value_counts().index
ax = sns.countplot(data=df, x='cuisine_type', hue='day_of_the_week', order=cuisine_order)
ax.set(title='Orders by Cuisine Type', xlabel='Cuisine Type', ylabel='Orders')
plt.legend(title=None)  # removes the legend title
plt.show()

# percentage of orders costing more than $20.
# The mean of a boolean mask is the fraction of rows where it is True, so this
# counts every order. Counting rows with DataFrame.value_counts().sum() is
# fragile because value_counts() drops any row that has NaN in any column.
(df['cost_of_the_order'] > 20).mean() * 100

29.24130663856691

# mean of the 'delivery_time' column across all orders
df['delivery_time'].mean()

24.161749209694417

# number of orders per customer ID, keeping the 3 most frequent customers
df['customer_id'].value_counts().iloc[:3]

# line graphs for each pair of numeric variables: (x column, y column, title, x label, y label)
line_plots = (
    ('delivery_time', 'cost_of_the_order',
     'Cost of Order vs Delivery Time', 'Delivery Time (minutes)', 'Cost (dollars)'),
    ('delivery_time', 'food_preparation_time',
     'Food Preparation Time vs Delivery Time', 'Delivery Time (minutes)',
     'Food Preparation Time (minutes)'),
    ('food_preparation_time', 'cost_of_the_order',
     'Cost of Order vs Food Preparation Time', 'Food Preparation Time (minutes)',
     'Cost (dollars)'),
)
for x_col, y_col, plot_title, x_label, y_label in line_plots:
    plt.figure(figsize=(10, 5))
    sns.lineplot(data=df, x=x_col, y=y_col, errorbar=None)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

# boxplots of order price for each cuisine type
plt.figure(figsize=(15, 7))
ax = sns.boxplot(data=df, x='cuisine_type', y='cost_of_the_order', hue='cuisine_type')
ax.set(title='Distribution of Price by Cuisine Type', xlabel='Cuisine Type', ylabel='Price')
plt.show()

# boxplots of food preparation time for each cuisine type
plt.figure(figsize=(15, 7))
ax = sns.boxplot(data=df, x='cuisine_type', y='food_preparation_time', hue='cuisine_type')
ax.set(
    title='Distribution of Food Preparation Time by Cuisine Type',
    xlabel='Cuisine Type',
    ylabel='Food Preparation Time',
)
plt.show()

# make 'rating' numeric; values that cannot be parsed (e.g. 'Not given') become NaN
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# line graph of rating against delivery time, one line per day-of-week category
plt.figure(figsize=(10, 5))
ax = sns.lineplot(data=df, x='delivery_time', y='rating', hue='day_of_the_week', errorbar=None)
ax.set(
    title='Rating vs Delivery Time by Day of the Week',
    xlabel='Delivery Time (minutes)',
    ylabel='Rating',
)
plt.legend(title=None)
plt.show()

# new 'total_time' column: delivery time plus food preparation time
df['total_time'] = df['delivery_time'].add(df['food_preparation_time'])

# line graph of rating against total time, one line per day-of-week category
plt.figure(figsize=(10, 5))
ax = sns.lineplot(data=df, x='total_time', y='rating', hue='day_of_the_week', errorbar=None)
ax.set(
    title='Rating vs Total Time by Day of the Week',
    xlabel='Total Time (minutes)',
    ylabel='Rating',
)
plt.legend(title=None)
plt.show()

# line graph of order cost against rating, one line per day-of-week category
ax = sns.lineplot(data=df, x='rating', y='cost_of_the_order', hue='day_of_the_week', errorbar=None)
ax.set(
    title='Rating vs Cost of Order by Day of the Week',
    xlabel='Rating',
    ylabel='Order Cost (dollars)',
)
plt.legend(title=None)
plt.show()

# bar graph of orders per cuisine type, split by day of week;
# cuisines are ordered from most to least ordered overall
plt.figure(figsize=(20, 5))
cuisine_order = df['cuisine_type'].value_counts().index
ax = sns.countplot(data=df, x='cuisine_type', hue='day_of_the_week', order=cuisine_order)
ax.set(title='Orders by Cuisine Type', xlabel='Cuisine Type', ylabel='Orders')
plt.legend(title=None)
plt.show()

# violin plot showing the density and distribution of price for each cuisine type
plt.figure(figsize=(20, 10))
ax = sns.violinplot(data=df, x='cuisine_type', y='cost_of_the_order', hue='cuisine_type')
ax.set(title='Distribution of Price by Cuisine Type', xlabel='Cuisine Type', ylabel='Price')
plt.show()

# bar graph of rating for each cuisine type
plt.figure(figsize=(15, 7))
ax = sns.barplot(data=df, x='cuisine_type', y='rating', hue='cuisine_type')
ax.set(title='Rating by Cuisine Type', xlabel='Cuisine Type', ylabel='Rating')
plt.show()

# heatmap of the pairwise correlations between all numeric columns
corr_matrix = df.select_dtypes(include=np.number).corr()
ax = sns.heatmap(corr_matrix, annot=True, cmap='Spectral', vmin=-1, vmax=1)
ax.set_title('Correlation Heatmap')
plt.show()

# make 'rating' numeric; values that cannot be parsed (e.g. 'Not given') become NaN
# (a no-op when the column has already been converted)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# group the ratings by restaurant once and derive both statistics from it
ratings_by_restaurant = df.groupby('restaurant_name')['rating']

# number of ratings each restaurant actually received: count() skips NaN, so
# unrated orders are excluded (value_counts() on 'restaurant_name' would count
# every order, rated or not, and let restaurants with few ratings through)
rating_counts = ratings_by_restaurant.count()

# average rating for each restaurant (NaN ratings are ignored by mean())
restaurant_ratings = ratings_by_restaurant.mean()

# keep restaurants with more than 50 ratings; both Series share the same
# restaurant_name index, so the boolean mask lines up row for row
filtered_restaurants = restaurant_ratings[rating_counts > 50]

# of those, keep restaurants whose average rating is higher than 4
promotional_restaurants = filtered_restaurants[filtered_restaurants > 4]

print(promotional_restaurants)

restaurant_name
Blue Ribbon Fried Chicken   4.328
Blue Ribbon Sushi           4.219
Parm                        4.128
RedFarm Broadway            4.244
RedFarm Hudson              4.176
Shake Shack                 4.278
The Meatball Shop           4.512
Name: rating, dtype: float64

# commission per order, chosen by price band:
#   cost > 20       -> 25% of the order cost
#   5 < cost <= 20  -> 15% of the order cost
#   otherwise       -> 0.0
order_cost = df['cost_of_the_order']
df['net_revenue'] = np.select(
    [order_cost > 20, (order_cost > 5) & (order_cost <= 20)],
    [order_cost * 0.25, order_cost * 0.15],
    default=0.0,
)

# add up the commission over all orders
total_net_revenue = df['net_revenue'].sum()

print("Total net revenue generated:", total_net_revenue)

Total net revenue generated: 6166.303

# net revenue expressed as a percentage of the summed cost of all orders
print(total_net_revenue / df['cost_of_the_order'].sum()*100)

19.691325065895317

# create a 'total_time' column by adding together 'delivery_time' and 'food_preparation_time'
df['total_time'] = df['delivery_time'] + df['food_preparation_time']

# percentage of orders taking more than 60 minutes in total.
# The mean of a boolean mask is the fraction of rows where it is True, so every
# order is counted. Counting rows with DataFrame.value_counts().sum() is wrong
# here: value_counts() drops each row containing NaN, and 'rating' holds NaN
# for unrated orders after pd.to_numeric(..., errors='coerce'), which
# undercounts the slow orders.
(df['total_time'] > 60).mean() * 100

6.269757639620653

# mean delivery time within each day-of-week category
df.groupby('day_of_the_week')['delivery_time'].mean()

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.750	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.080	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.230	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.200	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.590	Weekday	4	25	24

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
1893	1476701	292602	Chipotle Mexican Grill $1.99 Delivery	Mexican	22.310	Weekend	5	31	17
1894	1477421	397537	The Smile	American	12.180	Weekend	5	31	19
1895	1477819	35309	Blue Ribbon Sushi	Japanese	25.220	Weekday	Not given	31	24
1896	1477513	64151	Jack's Wife Freda	Mediterranean	12.180	Weekday	5	23	31
1897	1478056	120353	Blue Ribbon Sushi	Japanese	19.450	Weekend	Not given	28	24

	proportion
cuisine_type
American	30.769
Japanese	24.763
Italian	15.701
Chinese	11.328
Mexican	4.057
Indian	3.846
Middle Eastern	2.582
Mediterranean	2.424
Thai	1.001
French	0.948
Southern	0.896
Korean	0.685
Spanish	0.632
Vietnamese	0.369

		count
day_of_the_week	cuisine_type
Weekday	American	169
	Japanese	135
	Italian	91
	Chinese	52
	Indian	24
	Mexican	24
	Middle Eastern	17
	Mediterranean	14
	Southern	6
	French	5
	Thai	4
	Vietnamese	3
	Korean	2
	Spanish	1
Weekend	American	415
	Japanese	335
	Italian	207
	Chinese	163
	Mexican	53
	Indian	49
	Mediterranean	32
	Middle Eastern	32
	Thai	15
	French	13
	Korean	11
	Southern	11
	Spanish	11
	Vietnamese	4

	delivery_time
day_of_the_week
Weekday	28.340
Weekend	22.470

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
order_id	1898.000	NaN	NaN	NaN	1477495.500	548.050	1476547.000	1477021.250	1477495.500	1477969.750	1478444.000
customer_id	1898.000	NaN	NaN	NaN	171168.478	113698.140	1311.000	77787.750	128600.000	270525.000	405334.000
restaurant_name	1898	178	Shake Shack	219	NaN	NaN	NaN	NaN	NaN	NaN	NaN
cuisine_type	1898	14	American	584	NaN	NaN	NaN	NaN	NaN	NaN	NaN
cost_of_the_order	1898.000	NaN	NaN	NaN	16.499	7.484	4.470	12.080	14.140	22.297	35.410
day_of_the_week	1898	2	Weekend	1351	NaN	NaN	NaN	NaN	NaN	NaN	NaN
rating	1898	4	Not given	736	NaN	NaN	NaN	NaN	NaN	NaN	NaN
food_preparation_time	1898.000	NaN	NaN	NaN	27.372	4.632	20.000	23.000	27.000	31.000	35.000
delivery_time	1898.000	NaN	NaN	NaN	24.162	4.973	15.000	20.000	25.000	28.000	33.000

	count
restaurant_name
Shake Shack	219
The Meatball Shop	132
Blue Ribbon Sushi	119
Blue Ribbon Fried Chicken	96
Parm	68

	count
customer_id
52832	13
47440	10
83287	9

FoodHub Data Analysis

Context

Objective

Data Description

Data Dictionary

Let us start by importing the required libraries

Understanding the structure of the data

Observations:

Question 1: How many rows and columns are present in the data?

Observations:

Question 2: What are the datatypes of the different columns in the dataset?

Observations:

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method.

Observations:

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed?

Observations:

Question 5: How many orders are not rated?

Observations:

Exploratory Data Analysis (EDA)

Univariate Analysis

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.)

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Question 7: Which are the top 5 restaurants in terms of the number of orders received?

Observations:

Question 8: Which is the most popular cuisine on weekends?

Observations:

Question 9: What percentage of the orders cost more than 20 dollars?

Observations:

Question 10: What is the mean order delivery time?

Observations:

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed.

Observations:

Multivariate Analysis

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables)

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders.

Observations:

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.)

Observations:

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends?

Observations:

Conclusion and Recommendations

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.)

Conclusions:

Recommendations: