The Brazilian company Olist is an e-commerce company that specializes in the fields of logistics and capital. Olist is leading the way as the #1 commerce enabler for small businesses in Brazil and is now expanding globally.
The company started with a single core connecting merchants to marketplaces and evolved into a complete ecosystem of integrated products in three dimensions: Commerce, Logistics, and Capital.
The main goal of this project is to use Olist's data to identify buying patterns that generate knowledge about its operations and add value to the business, through the extraction, analysis, and visualization of data.
This work will be carried out solely by me as a master's student, based on what I learned in the Introduction to Data Science classes and on the feedback received from Dr. Nicholas Mattei, the professor of the course.
This project is developed using Python as the programming language, Google Colab as the development environment, and GitHub as the code repository and web server.
This dataset is available on Kaggle. On the web page you can also find the data model, which is important for understanding the relationships between the different datasets.
There are eight datasets with the following data:
One of the business questions I would like to delve into is whether the location of customers and sellers affects sales or the products purchased, so I will mainly analyze the customer and seller datasets, as well as the datasets that allow them to be related. For this, it is necessary to know how the buyers, the sellers, and the delivered orders are distributed.
#Reading the main datasets related to the business question.
import pandas as pd
import matplotlib.pyplot as plt
customers = pd.read_csv("../data/olist_customers_dataset.csv")
sellers = pd.read_csv("../data/olist_sellers_dataset.csv")
#This dataset provides the state geometries used to draw maps
#For the visualization, geopandas is required
# See how to install it https://geopandas.org/en/stable/getting_started/install.html
import geopandas as gpd
locations = gpd.read_file("zip://../data/states.zip")
#Ignoring user warnings about KMeans memory leak
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
#The data available for the customers
customers.head(4)
 | customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state |
---|---|---|---|---|---|
0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP |
1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP |
2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP |
3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775 | mogi das cruzes | SP |
customers.dtypes
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object
In this case, the customers dataset has 5 variables and all of their types are correctly identified.
#The dataset doesn't have null or NaN values: count() returns the number of non-null
#entries per column, and every column reports 99441, which equals the total number of rows
customers.count()
customer_id                 99441
customer_unique_id          99441
customer_zip_code_prefix    99441
customer_city               99441
customer_state              99441
dtype: int64
An important feature of this dataset is that it contains two identifiers. The customer_id identifies the client within an order, meaning that a client can have several customer_id values if they have placed several orders, while they have only one customer_unique_id.
So we are interested in knowing the company's number of customers and whether their location remains the same in each order.
#Total of rows in the customers dataframe
display("Number of no unique customers",customers.shape[0])
#Total of rows with unique id of the customers dataframe
display("Number of customers",len(pd.unique(customers["customer_unique_id"])))
'Number of non-unique customers'
99441
'Number of customers'
96096
This means that about 3,345 records (99,441 - 96,096) refer to customers who have made repeat purchases.
#Getting the number of ids (or orders) created by each user
customers_by_orders = customers.groupby("customer_unique_id").count()["customer_id"].reset_index(name='count') \
.sort_values(['count'], ascending=False)
#Customers who purchased one order
customers_one_order=customers_by_orders.loc[customers_by_orders["count"]==1]["customer_unique_id"]
display("customers who purchased one order")
display(customers_one_order)
# Keeping only rows whose combination of customer_unique_id and location appears once
# (this removes repeat customers who kept the same location)
unduplicated_customers = customers[~customers.duplicated(subset=["customer_unique_id","customer_city","customer_zip_code_prefix","customer_state"],keep=False)]
display("unique customers")
display(unduplicated_customers)
# Removing single-purchase customers to keep repeat customers who have different locations
customers_different_location = unduplicated_customers.loc[~unduplicated_customers["customer_unique_id"].isin(customers_one_order)]
display("customer with different locations")
display(customers_different_location)
'customers who purchased one order'
65416    adfdc84671ccd55dc3c4aa31b363bfb4
65652    ae9ae5072fe8d5b2a1f7072a833f0afc
64300    aaff146bd210808c7d004a05966472a7
65653    ae9b625c590bbfdd9e9ef836fbcbc7c9
65654    ae9b719375655dab3c518d76cc37a843
                       ...
32508    5657dfebff5868c4dc7e8355fea865c4
32507    5657596addb4d7b07b32cd330614bdf8
32506    5656eb169546146caeab56c3ffc3d268
32505    5656a8fabc8629ff96b2bc14f8c09a27
96095    ffffd2657e2aad2907e67c3e9daecbeb
Name: customer_unique_id, Length: 93099, dtype: object
'unique customers'
 | customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state |
---|---|---|---|---|---|
0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP |
1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP |
2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP |
3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775 | mogi das cruzes | SP |
4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | 13056 | campinas | SP |
... | ... | ... | ... | ... | ... |
99436 | 17ddf5dd5d51696bb3d7c6291687be6f | 1a29b476fee25c95fbafc67c5ac95cf8 | 3937 | sao paulo | SP |
99437 | e7b71a9017aa05c9a7fd292d714858e8 | d52a67c98be1cf6a5c84435bd38d095d | 6764 | taboao da serra | SP |
99438 | 5e28dfe12db7fb50a4b2f691faecea5e | e9f50caf99f032f0bf3c55141f019d99 | 60115 | fortaleza | CE |
99439 | 56b18e2166679b8a959d72dd06da27f9 | 73c2643a0a458b49f58cea58833b192e | 92120 | canoas | RS |
99440 | 274fa6071e5e17fe303b9748641082c8 | 84732c5050c01db9b23e19ba39899398 | 6703 | cotia | SP |
93582 rows × 5 columns
'customer with different locations'
 | customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state |
---|---|---|---|---|---|
5 | 879864dab9bc3047522c92c82e1212b8 | 4c93744516667ad3b8f1fb645a3116a4 | 89254 | jaragua do sul | SC |
387 | 13e8265ea7e8a11df7a56799f5806220 | d55417a83a73d1702b43988e63f0731a | 8576 | itaquaquecetuba | SP |
453 | 302025f399c7405c509a297b6b9a1e73 | 7bf1c59401206e31a78523077f3a1144 | 6765 | taboao da serra | SP |
559 | 079da973ecd0fb654da4ffdb946c9ea1 | 527c6fbbe85cdf78ff1645e0a71e266f | 1455 | sao paulo | SP |
562 | 05425c98e35d762211742df2a2b30af0 | 87a7c3090f96d9da963125a0d7f8193a | 6114 | osasco | SP |
... | ... | ... | ... | ... | ... |
99182 | bafd51862cb88bd42d499170c2ac35fd | 408aee96c75632a92e5079eee61da399 | 22260 | rio de janeiro | RJ |
99227 | 1bd05d9a0031e8461d15550893d9b85a | 13abc50b97af7425b5066e405d7cd760 | 7793 | cajamar | SP |
99297 | 8b9f55ee19f0356ba6b3f03485e6922d | bc7b9e0d078c0c01f622b38cfcd7ee9c | 20521 | rio de janeiro | RJ |
99353 | 282fbce48e4d2077aad602dd125c9225 | 0ceb502fc33a2ad327b08288c5310e2e | 29134 | viana | ES |
99406 | d9110683c7a282144e9fc97660026a28 | 5cbfdb85ec130898108b32c50d619c39 | 74980 | aparecida de goiania | GO |
483 rows × 5 columns
#One example of a customer who has two different locations
customers.loc[customers["customer_unique_id"]=="d55417a83a73d1702b43988e63f0731a"]
 | customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state |
---|---|---|---|---|---|
387 | 13e8265ea7e8a11df7a56799f5806220 | d55417a83a73d1702b43988e63f0731a | 8576 | itaquaquecetuba | SP |
56661 | ee910160bbbdf7ba4b1dbd4dbc1db3ae | d55417a83a73d1702b43988e63f0731a | 3613 | sao paulo | SP |
Taking the above into account, although a client may have more than one location and there is no way of knowing which one is their residence, it is still feasible to obtain the distribution of customer locations without considering whether the customer is unique or not. In the end, we want to know whether there is a pattern between where the order was delivered and the location of the sellers.
#Cleaning geometry dataset to get the geometry and the acronym of each state
#and deleting additional variables.
locations[['COUNTRY','STATE']] = locations['HASC_1'].str.split('.',expand=True)
locations = locations.drop(columns=["ID_0","ISO","NAME_0","ID_1","CCN_1","CCA_1","TYPE_1","ENGTYPE_1","NL_NAME_1","VARNAME_1","COUNTRY","HASC_1"])
locations = locations.rename(columns ={'NAME_1':'NAME'})
locations.head(5)
 | NAME | geometry | STATE |
---|---|---|---|
0 | Acre | POLYGON ((-73.33251 -7.32488, -73.27482 -7.350... | AC |
1 | Alagoas | MULTIPOLYGON (((-35.90153 -9.86180, -35.90153 ... | AL |
2 | Amapá | MULTIPOLYGON (((-50.02403 0.85986, -50.02403 0... | AP |
3 | Amazonas | POLYGON ((-67.32623 2.02968, -67.30118 1.92997... | AM |
4 | Bahia | MULTIPOLYGON (((-38.69708 -17.97903, -38.69708... | BA |
#Visualizing customers by state
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
#defining the figure to create
fig, ax = plt.subplots(1, 1,figsize=(20, 16))
#Joining geometries and number of customers
customer_state = pd.DataFrame(customers.groupby("customer_state").count()["customer_id"])
location_customers=locations.merge(customer_state,left_on="STATE",right_on="customer_state")
#Defining coords to place the label on each state
location_customers["coords"] = location_customers["geometry"].apply(lambda x: x.representative_point().coords[:])
location_customers["coords"] = [coords[0] for coords in location_customers["coords"]]
#Plotting data
location_customers.plot(column='customer_id',ax=ax, cmap='OrRd', scheme='natural_breaks',
legend=True)
ax.set_title("Number of customers by state")
for idx, row in location_customers.iterrows():
ax.annotate(row['NAME'], xy=row["coords"], horizontalalignment="center", color="black")
For this case, it is observed that the state of SP (São Paulo) is where the most orders are delivered, and that there is a stronger customer presence in the south of the country.
#Visualizing customers by city
customers_city_state= customers.groupby(["customer_state","customer_city"]).count()
customers_city_state = customers_city_state.loc[customers_city_state["customer_id"]>250]
customers_city_state["customer_id"].plot.bar(title="Cities from where a user receives a purchase",ylabel="Number of customers",figsize=(20, 10))
<AxesSubplot:title={'center':'Cities from where a user receives a purchase'}, xlabel='customer_state,customer_city', ylabel='Number of customers'>
For display purposes, only cities with more than 250 customers who have received an order are shown. São Paulo is the city where most purchases are received, followed by Rio de Janeiro; these are the most populated cities in Brazil.
#Visualizing the sellers dataframe
sellers.head(4)
 | seller_id | seller_zip_code_prefix | seller_city | seller_state |
---|---|---|---|---|
0 | 3442f8959a84dea7ee197c632cb2df15 | 13023 | campinas | SP |
1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | mogi guacu | SP |
2 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | rio de janeiro | RJ |
3 | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195 | sao paulo | SP |
#The variables are correctly defined
sellers.dtypes
seller_id                 object
seller_zip_code_prefix     int64
seller_city               object
seller_state              object
dtype: object
#There are no seller duplicates
sellers.seller_id.is_unique
True
Unlike buyers, sellers have a single location, so it is possible to see their distribution directly.
#Visualizing sellers by state
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
#defining the figure to create
fig, ax = plt.subplots(1, 1,figsize=(20, 16))
#Joining geometries and number of sellers
seller_state = pd.DataFrame(sellers.groupby("seller_state").count()["seller_id"])
seller_customers=locations.merge(seller_state,left_on="STATE",right_on="seller_state")
#Defining coords to place the label on each state
seller_customers["coords"] = seller_customers["geometry"].apply(lambda x: x.representative_point().coords[:])
seller_customers["coords"] = [coords[0] for coords in seller_customers["coords"]]
#Plotting data
seller_customers.plot(column='seller_id',ax=ax, cmap='OrRd', scheme='natural_breaks',
legend=True)
ax.set_title("Number of sellers by state")
for idx, row in seller_customers.iterrows():
ax.annotate(row['NAME'], xy=row["coords"], horizontalalignment="center", color="black")
#Visualizing sellers by city
sellers_city_state= sellers.groupby(["seller_state","seller_city"]).count()
sellers_city_state=sellers_city_state.loc[sellers_city_state["seller_id"]>10]
sellers_city_state["seller_id"].plot.bar(title="Cities from where a seller sends an order",ylabel="Number of sellers",figsize=(20, 10))
<AxesSubplot:title={'center':'Cities from where a seller sends an order'}, xlabel='seller_state,seller_city', ylabel='Number of sellers'>
It is interesting that the state of SP and the city of São Paulo again have the most sellers, just as they have the most buyers; however, the second city with the most sellers is Curitiba, which is also where Olist is located.
In the previous images we could see the number of sellers and buyers by city, but are these proportional to the population of each city? We are going to use the population data provided on this website, which contains 2022 population data by city.
pop_city=pd.read_csv("../data/population.csv")
#Converting city names to lowercase to join with the sellers and customers data
pop_city["city"]=pop_city["city"].str.lower()
# Joining with cities with more than 250 customers
customers_city_pop = customers_city_state.merge(pop_city,left_on="customer_city",right_on="city",how="left")
customers_city_pop.dropna(inplace=True)
#Normalizing by population
customers_city_pop["ratio_pop"]=(customers_city_pop["customer_id"]*100000)/customers_city_pop["2022"]
# Joining with the cities with more than 10 sellers
sellers_city_pop = sellers_city_state.merge(pop_city,left_on="seller_city",right_on="city",how="left")
sellers_city_pop.dropna(inplace=True)
#Normalizing by population
sellers_city_pop["ratio_pop"]=(sellers_city_pop["seller_id"]*100000)/sellers_city_pop["2022"]
#Making a graph of city vs ratio_pop for customers and sellers (the figure is created below with subplots)
# Setting up columns and defining cities as index
sellers_graph= sellers_city_pop[["city","ratio_pop"]]
sellers_graph.set_index("city",inplace=True)
sellers_graph.sort_index(inplace=True)
customer_graph= customers_city_pop[["city","ratio_pop"]]
customer_graph.set_index("city",inplace=True)
customer_graph.sort_index(inplace=True)
fig, axes = plt.subplots(nrows=2, ncols=1)
#Setting figure to get a better visualization
fig.set_size_inches(15.5, 10.5)
fig.tight_layout(pad=12.0)
customer_graph.plot.bar(title="Number of customers per 100,000 people per city",ax=axes[0])
ax.set_title("test")
sellers_graph.plot.bar(title="Number of sellers per 100,000 people per city",ax=axes[1])
<AxesSubplot:title={'center':'Number of sellers per 100,000 people per city'}, xlabel='city'>
By normalizing these graphs by population, it is observed that São Paulo is no longer the most representative city; in reality, São Caetano do Sul has a greater number of customers relative to its population, as well as a greater number of sellers relative to its population.
Does this indicate that, even though Olist is an e-commerce platform, there is a relationship between the location of sellers and customers?
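One way to check the claim about São Caetano do Sul is the minimal sketch below, which simply sorts the normalized ratios computed above (it reuses the customers_city_pop and sellers_city_pop dataframes):
#Top cities by customers and sellers per 100,000 inhabitants (sketch, reusing the ratios computed above)
display("Top customer cities (per 100k)",
        customers_city_pop.sort_values("ratio_pop", ascending=False)[["city","ratio_pop"]].head(5))
display("Top seller cities (per 100k)",
        sellers_city_pop.sort_values("ratio_pop", ascending=False)[["city","ratio_pop"]].head(5))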
Olist is a customer-oriented company, and therefore the opinion of its customers is always considered, since it is the metric used to assess the quality of its services. That is why the ratings that users give to their orders are analyzed.
orders_review = pd.read_csv("../data/olist_order_reviews_dataset.csv")
#Here we are going to use pandas-profiling to get a report of this dataframe
from pandas_profiling import ProfileReport
order_review_profile = ProfileReport(orders_review, title="Orders review Profiling Report")
order_review_profile
In this report, we can see there are many nulls in the review titles (88.3%) and in the comments (58.7%), which is why we decided to keep only the score, which has no null values. It is also notable that 5-star ratings make up almost 58% of the dataset and 4-star ratings almost 20%.
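As a quick cross-check of the percentages reported by pandas-profiling, a minimal sketch that computes the score distribution directly from the orders_review dataframe loaded above:
#Share of each review score (sketch); value_counts(normalize=True) returns proportions
score_share = orders_review["review_score"].value_counts(normalize=True).sort_index()
display(score_share.round(3))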
#Keeping only the fields that have values
orders_review=orders_review.drop(columns=["review_comment_title","review_comment_message"])
#Treating the score as a class (string) instead of a number
orders_review["review_score"]=orders_review["review_score"].astype(str)
To get a better context about the ratings, it is necessary to add more information about the orders, such as delivery times and purchase value.
The orders dataset (orders_dataset) is read; it contains the order id, the customer id, the status of the order, and several important dates: the purchase date, the payment approval date, the date on which the order was handed to the carrier, the delivery date, and the estimated delivery date given to the consumer when they made the purchase. To perform analyses on these data, they must be in the appropriate datetime format.
Additionally, there are the order payments (orders_payment), which include the order id, a sequential id that indicates whether there were several payment types, the payment type, the number of installments in the case of a credit card, and the payment value.
#Reading data and visualizing data types for each dataframe
orders_dataset = pd.read_csv("../data/olist_orders_dataset.csv")
orders_payment = pd.read_csv("../data/olist_order_payments_dataset.csv")
display("Orders",orders_dataset.dtypes)
display("Orders payments",orders_payment.dtypes)
display("Orders: Missing values",orders_dataset.isna().sum())
display("Orders payments: Missing values",orders_payment.isna().sum())
display("Orders: Total values",orders_dataset.shape[0])
display("Orders payments: Total values",orders_payment.shape[0])
'Orders'
order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object
'Orders payments'
order_id                 object
payment_sequential        int64
payment_type             object
payment_installments      int64
payment_value           float64
dtype: object
'Orders: Missing values'
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64
'Orders payments: Missing values'
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64
'Orders: Total values'
99441
'Orders payments: Total values'
103886
# Making a graph of the payment type distribution
orders_payment.groupby(['payment_type']).count().reset_index()[["payment_type", "order_id"]].rename(columns={"order_id":"count"}).plot.bar(x="payment_type")
<AxesSubplot:xlabel='payment_type'>
Most of the transactions are made using a credit card, which is expected for an e-commerce store. On the other hand, some dates have null values, such as the date of delivery to the customer and the date the order was handed to the carrier; the column with the most missing values has around 3,000 nulls out of 99,441 observations, so it is still possible to carry out a general analysis of those values.
#changing strings to datetimes
orders_dataset["order_purchase_timestamp"] = pd.to_datetime(orders_dataset["order_purchase_timestamp"])
orders_dataset["order_approved_at"] = pd.to_datetime(orders_dataset["order_approved_at"])
orders_dataset["order_delivered_carrier_date"] = pd.to_datetime(orders_dataset["order_delivered_carrier_date"])
orders_dataset["order_delivered_customer_date"] = pd.to_datetime(orders_dataset["order_delivered_customer_date"])
orders_dataset["order_estimated_delivery_date"] = pd.to_datetime(orders_dataset["order_estimated_delivery_date"])
#The idea is to analyze the behavior of the sales process; for this we only need orders that have already been delivered.
orders_delivered=orders_dataset.loc[orders_dataset["order_status"]=='delivered'].copy()
display("shape before",orders_delivered.shape)
display(orders_delivered.isna().sum())
#Because only a few values are missing, we remove them, taking into account that those observations represent about 0.02% of the data
orders_delivered.dropna(inplace=True)
display("shape after",orders_delivered.shape)
'shape before'
(96478, 8)
order_id                          0
customer_id                       0
order_status                      0
order_purchase_timestamp          0
order_approved_at                14
order_delivered_carrier_date      2
order_delivered_customer_date     8
order_estimated_delivery_date     0
dtype: int64
'shape after'
(96455, 8)
#If we take the purchase timestamp as our baseline to define the times of the entire sales process, we can get an idea of the general behavior
orders_delivered["base_approved"]=(orders_dataset["order_approved_at"] - orders_dataset["order_purchase_timestamp"])/pd.Timedelta('1h')
orders_delivered["base_delivered_carrier"]=(orders_dataset["order_delivered_carrier_date"] - orders_dataset["order_purchase_timestamp"])/pd.Timedelta('1h')
orders_delivered["base_delivered_customer"]=(orders_dataset["order_delivered_customer_date"] - orders_dataset["order_purchase_timestamp"])/pd.Timedelta('1h')
orders_delivered["base_estimated_delivery"]=(orders_dataset["order_estimated_delivery_date"] - orders_dataset["order_purchase_timestamp"])/pd.Timedelta('1h')
orders_delivered["estimated_delivery_delivered_customer"]=(orders_dataset["order_estimated_delivery_date"] - orders_dataset["order_delivered_customer_date"])/pd.Timedelta('1h')
#Make a graph to see the time differences
orders_delivered_melt = orders_delivered.melt(id_vars=['order_id'], value_vars=["base_delivered_carrier","base_delivered_customer","base_estimated_delivery"],
var_name='time_type', value_name='difference_times')
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.violinplot(data=orders_delivered_melt, x="time_type", y="difference_times")
<AxesSubplot:xlabel='time_type', ylabel='difference_times'>
In the graph above it is possible to see the time differences between the purchase date and: the date the carrier receives the order, the date it is delivered to the customer, and the date the customer was promised delivery. However, there is something strange: there are negative differences between the date the order was handed to the carrier and the purchase date. Let's look at the following records:
orders_delivered.loc[orders_delivered["base_delivered_carrier"]<-100][["order_purchase_timestamp","order_delivered_carrier_date","base_delivered_carrier"]]
 | order_purchase_timestamp | order_delivered_carrier_date | base_delivered_carrier |
---|---|---|---|
25883 | 2018-07-16 18:40:53 | 2018-01-26 13:35:00 | -4109.098056 |
83321 | 2018-08-18 11:49:40 | 2018-08-14 06:22:00 | -101.461111 |
Does it make sense that the purchase was made on 2018-07-16 but the carrier received it on 2018-01-26? This means there are errors in the data. To avoid biasing the data with these negative values, we assign them the purchase time (i.e., a difference of zero hours), taking into account that in the distribution these dates tend to be close to each other.
orders_delivered.loc[orders_delivered["base_delivered_carrier"]<0,"base_delivered_carrier"]=0
#Make a graph to see the time differences
orders_delivered_melt = orders_delivered.melt(id_vars=['order_id'], value_vars=["base_delivered_carrier","base_delivered_customer","base_estimated_delivery"],
var_name='time_type', value_name='difference_times')
sns.violinplot(data=orders_delivered_melt, x="time_type", y="difference_times")
<AxesSubplot:xlabel='time_type', ylabel='difference_times'>
There are still some outliers in the data, but is this good or bad? Let's compare the estimated delivery time with the actual delivery time.
#Make a graph to see the time differences
sns.violinplot(data=orders_delivered["estimated_delivery_delivered_customer"]).set(title='Difference between Estimated deliver time and Delivered time (Hours)')
[Text(0.5, 1.0, 'Difference between Estimated deliver time and Delivered time (Hours)')]
It is observed that the differences tend to be positive, which means that in general the estimated delivery time is greater than the actual delivery time. The high outliers are not alarming, since they correspond to customers who received their delivery much earlier than estimated. However, the extreme negative values show that some orders were delivered much later than expected.
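To quantify this, a minimal sketch that computes the share of delivered orders that arrived after the promised date, i.e., where estimated_delivery_delivered_customer is negative:
#Share of delivered orders that arrived later than the estimated delivery date (sketch)
late_share = (orders_delivered["estimated_delivery_delivered_customer"] < 0).mean()
display("Share of late deliveries", round(late_share, 3))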
Is it possible that there is a relationship between purchase times, purchase value and customer rating?
Is there a relationship between the location of sellers and customers?
#Including data to relate customers and sellers
order_items = pd.read_csv("../data/olist_order_items_dataset.csv")
#Joining customers and sellers through the order id
sellers_customers=orders_dataset.loc[orders_dataset["order_status"]=="delivered",["order_id","customer_id"]].merge(order_items[["order_id","seller_id"]]).drop_duplicates()
## Adding seller and customer locations
# For these joins we are going to leave the default behavior which is inner, because we need the rows that have both values.
# This was analyzed before and we didn't have missing data.
sellers_customers=sellers_customers.merge(sellers[["seller_state","seller_id"]])
#We use customer_id instead of the unique id because, as defined before, a customer_unique_id can have several locations; for this analysis we treat each row as a separate customer to include all possible locations for each customer.
sellers_customers=sellers_customers.merge(customers[["customer_id","customer_state"]])
order_states = sellers_customers.groupby(["seller_state","customer_state"]).agg({"order_id":"count"}).reset_index()
sns.violinplot(order_states["order_id"])
<AxesSubplot:>
#As we have a large range we can use a logarithmic scale to show the relation between seller and customer states
from math import log10
order_states["log_order_id"] =order_states["order_id"].apply(log10)
sns.heatmap(order_states.pivot(index="seller_state", columns="customer_state", values="log_order_id"))
<AxesSubplot:xlabel='customer_state', ylabel='seller_state'>
It seems there is no clear correlation between seller and customer state. São Paulo stands out both as a seller and as a customer state. Sellers from diverse states are competing for the countrywide market.
This makes the delivery time an important point to discuss, as stated in question 2.
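As a complementary check on the seller and customer state relationship, a minimal sketch that measures how many delivered orders have the seller and the customer in the same state, using the sellers_customers dataframe built above:
#Share of orders where the seller and the customer are located in the same state (sketch)
same_state_share = (sellers_customers["seller_state"] == sellers_customers["customer_state"]).mean()
display("Share of same-state orders", round(same_state_share, 3))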
Is it possible that there is a relationship between purchase times, purchase value and customer rating?
#Showing relationship between variables
rscore_time = orders_review.merge(orders_delivered)[["review_score", 'base_approved', 'base_delivered_carrier', 'base_delivered_customer',
'base_estimated_delivery', 'estimated_delivery_delivered_customer' ]]
rscore_time["review_score_f"] = rscore_time["review_score"].astype("float")
sns.heatmap(rscore_time.corr(), annot=True)
<AxesSubplot:>
There are two important conclusions from this plot:
1. There is a negative correlation between the delivery time and the user satisfaction.
2. There is a positive correlation between the review score and the difference between the estimated and the actual delivery time.
A hypothesis can be drawn from here: a better delivery time estimate can improve the customer satisfaction.
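To get a first look at this hypothesis, a minimal sketch that compares the average review score of orders delivered on or before the estimated date with that of late orders, reusing the rscore_time dataframe defined above:
#Average review score for on-time (or early) deliveries vs late deliveries (sketch)
on_time = rscore_time["estimated_delivery_delivered_customer"] >= 0
display("Mean score, delivered on or before the estimate", round(rscore_time.loc[on_time, "review_score_f"].mean(), 2))
display("Mean score, delivered after the estimate", round(rscore_time.loc[~on_time, "review_score_f"].mean(), 2))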
#Note: product characteristics are assumed to be encoded in the seller and the freight value.
grouped_order_items = order_items.groupby(["order_id","seller_id"]).agg({"price":"sum", "product_id":"count", "freight_value":"sum"}).reset_index()
# In this case we keep the seller id, as the seller's internal processes can influence the delivery time.
orig_dataset=orders_delivered[["base_delivered_customer","order_id"]].merge(grouped_order_items).merge(sellers_customers).drop(columns=["order_id", "customer_id"])
sns.heatmap(orig_dataset.corr())
<AxesSubplot:>
#Including customer and seller location
sns.heatmap(pd.get_dummies(orig_dataset.drop(columns=["seller_id"])).corr())
<AxesSubplot:>
Apparently, the location, the estimated time (hours), and the review score have a relationship among them.
#The seller id is important to capture behavior that is implicit to each seller. In this case, we use a label encoder instead of dummy variables to reduce the complexity of the model.
from sklearn import preprocessing
label_encoder_seller = preprocessing.LabelEncoder()
label_encoder_seller.fit(orig_dataset.seller_id)
orig_dataset["seller_id_enc"] = label_encoder_seller.transform(orig_dataset.seller_id)
##Generating the final dataset to create the model
dataset_t = pd.get_dummies(orig_dataset.drop(columns=["seller_id"]))
#Removing high outliers; there is no reason for a delivery to take more than 2000 hours.
dataset_t= dataset_t.loc[dataset_t.base_delivered_customer<2000]
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
#Scaling data and defining independent and dependent variables
scaler = StandardScaler()
target="base_delivered_customer"
X = dataset_t.drop(columns=[target])
scaler.fit(X)
X = scaler.transform(X)
y = dataset_t[target]
##Distribution of target variable
ax = sns.histplot(dataset_t.base_delivered_customer)
ax.set(xlabel='Delivered time to customer (Hours)', ylabel='Frequency')
plt.show()
#Showing the difference between estimated times and actual delivery times
sns.histplot(orders_delivered.estimated_delivery_delivered_customer-orders_delivered.base_delivered_customer)
<AxesSubplot:ylabel='Count'>
It is important to look at the current distribution of the time difference between the estimated delivery time and the time at which the product was actually delivered. This will allow us to compare the results of the proposed models against the current delivery estimates. The values oscillate between -9535 and 3418 hours.
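To have a baseline to compare the models against, a minimal sketch that computes the RMSE of the current delivery estimate, i.e., the root mean square of the estimated_delivery_delivered_customer differences computed above:
#Baseline RMSE of the current delivery estimate, in hours (sketch)
import numpy as np
estimate_error = orders_delivered["estimated_delivery_delivered_customer"]
baseline_rmse = np.sqrt((estimate_error ** 2).mean())
display("Baseline RMSE of the current estimate (hours)", round(baseline_rmse, 2))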
from sklearn.neighbors import KNeighborsRegressor
#Searching for the best K by cross-validated RMSE
error_rate = []
for i in range(1,100,5):
    knn = KNeighborsRegressor(n_neighbors=i)
    #cross_val_score fits the model internally on each fold, so no separate fit/predict is needed
    error_rate.append(-cross_val_score(knn, X, y, cv=10, scoring='neg_root_mean_squared_error').mean())
#Plotting the error for each K to choose the best one
sns.lineplot(x=range(1,100,5), y=error_rate)
<AxesSubplot:>
#Using the best K based on the previous graph
from sklearn.neighbors import KNeighborsRegressor
regr = KNeighborsRegressor(n_neighbors=15)
##Getting rmse for each fold
cross_val_score(regr, X, y, cv=10,scoring= 'neg_root_mean_squared_error')
array([-183.53965948, -184.00820862, -181.05569141, -180.13306799, -183.22288605, -183.34437028, -183.47975409, -182.41792475, -188.51680134, -182.15439488])
regr.fit(X,y)
#Distribution of the differences between predicted and actual values
sns.histplot(regr.predict(X)-y)
<AxesSubplot:xlabel='base_delivered_customer', ylabel='Count'>
In the last graph, we can see that some true values are much greater than their predictions, and that the differences oscillate between -1775 and 567 hours.
from sklearn import linear_model
import numpy as np
#Searching for the best alpha by cross-validated RMSE
error_rate_alpha = []
for i in np.arange(0.0,1.0,0.1):
    clf = linear_model.Lasso(alpha=i)
    #cross_val_score fits the model internally on each fold, so no separate fit/predict is needed
    error_rate_alpha.append(-cross_val_score(clf, X, y, cv=10, scoring='neg_root_mean_squared_error').mean())
#Plotting the error for each alpha to choose the best one
sns.lineplot(x=np.arange(0.0,1.0,0.1), y=error_rate_alpha)
<AxesSubplot:>
clf = linear_model.Lasso(alpha=0.1)
##Getting rmse for each fold
cross_val_score(clf, X, y, cv=10,scoring= 'neg_root_mean_squared_error')
array([-186.55442102, -188.13532602, -185.38760278, -183.59172968, -187.91146203, -185.97658646, -186.99953442, -186.79473693, -192.13146013, -186.21168646])
clf.fit(X,y)
#Distribution of the differences between predicted and actual values
sns.histplot(clf.predict(X)-y)
<AxesSubplot:xlabel='base_delivered_customer', ylabel='Count'>
In the last graph, we see a behavior similar to the KNN model: some true values are much greater than their predictions, and the differences oscillate between -1744 and 1543 hours.
Although it is possible to obtain a model that predicts the delivery time, a better model would require additional variables such as the transport companies, weather data, more detailed location information, data associated with the purchase season, and more.
According to Statista, the average delivery time for online orders is approximately 21 days, that is, approximately 504 hours. Additionally, the two models used previously show similar behavior, with errors of approximately 190 hours, which still carries too much uncertainty; however, compared with deliveries in the dataset that take more than 5000 hours, it is not so far off.
Although the KNN model has an error similar to the Lasso model, its differences between actual and predicted values are smaller, which is why it could work better for this problem, taking into account that either of the two models improves on the current estimated delivery times.
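To make this comparison concrete, a minimal sketch that puts the two cross-validated mean RMSEs side by side, reusing the regr (KNN) and clf (Lasso) models defined above:
#Comparing the mean cross-validated RMSE of both models (sketch)
knn_rmse = -cross_val_score(regr, X, y, cv=10, scoring='neg_root_mean_squared_error').mean()
lasso_rmse = -cross_val_score(clf, X, y, cv=10, scoring='neg_root_mean_squared_error').mean()
display("KNN mean RMSE (hours)", round(knn_rmse, 2))
display("Lasso mean RMSE (hours)", round(lasso_rmse, 2))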
However, more precision is required to achieve the objective of giving the user more tools to decide whether to make a purchase or not.