import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime, nltk, warnings
import matplotlib.cm as cm
import itertools
import missingno as msno
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing, model_selection, metrics, feature_selection
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from wordcloud import WordCloud, STOPWORDS
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from pathlib import Path
from IPython.display import display, HTML
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
%matplotlib inline
df = pd.read_csv('/Users/anusha/Desktop/Assignments-Spring2020/AIT-582/Project/data.csv',encoding="ISO-8859-1",dtype={'CustomerID': str,'InvoiceNo': str})
print('Dataframe dimensions:', df.shape)
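The absolute path above is machine-specific. As a hedged alternative (assuming data.csv is stored next to the notebook), the pathlib import from above can build a portable path; the read below is illustrative and kept commented out:
# hypothetical portable path: assumes the CSV sits in the notebook's working directory
# data_path = Path.cwd() / 'data.csv'
# df = pd.read_csv(data_path, encoding="ISO-8859-1", dtype={'CustomerID': str, 'InvoiceNo': str})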
#______
Converting the InvoiceDate column from string to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
Display the first 5 rows of the data frame
df.head()
df.shape
df.info()
df.describe()
Display number of missing values grouped by column
data_clean = pd.DataFrame(df.dtypes).T.rename(index={0: 'column type'})
data_clean = pd.concat([data_clean, pd.DataFrame(df.isnull().sum()).T.rename(index={0: 'null values (nb)'})])
data_clean = pd.concat([data_clean, pd.DataFrame(df.isnull().sum()/df.shape[0]*100).T.rename(index={0: 'null values (%)'})])
print ('-' * 10 + " Variable type and number of missing values " + '-' * 10 )
print()
display(data_clean)
Visualize missing values: we see that Description and CustomerID have missing values.
msno.matrix(df)
Dropping missing values
df.dropna(inplace = True)
print('Dataframe dimensions:',df.shape)
# gives some information on columns types and number of null values
data_clean = pd.DataFrame(df.dtypes).T.rename(index={0: 'column type'})
data_clean = pd.concat([data_clean, pd.DataFrame(df.isnull().sum()).T.rename(index={0: 'null values (nb)'})])
data_clean = pd.concat([data_clean, pd.DataFrame(df.isnull().sum()/df.shape[0]*100).T.rename(index={0: 'null values (%)'})])
display(data_clean)
Dropping duplicate values
print('Duplicate data entries: {}'.format(df.duplicated().sum()))
df.drop_duplicates(inplace = True)
Customers = df.groupby('CustomerID')['Country'].unique()
Customers.loc[Customers.apply(lambda x:len(x)>1)]
Exploring the total number of countries
df_temp = df[['CustomerID', 'InvoiceNo', 'Country']].groupby(
['CustomerID', 'InvoiceNo', 'Country']).count()
df_temp = df_temp.reset_index(drop = False)
countries = df_temp['Country'].value_counts()
print('No. of countries in dataframe: {}'.format(len(countries)))
no_countries = df[['CustomerID','Country']].groupby(['Country']).count()
no_countries = no_countries.reset_index(drop = False)
print('-' * 10 + " Distribution of orders per country "+ '-' * 10)
print()
print (no_countries.sort_values(
by='CustomerID', ascending=False).rename(index=str,
columns={"CustomerID": "No of orders per Country"}))
Creating FinalPrice, InvoiceMonth and Day of week columns for the analysis
for i, v in df.groupby('CustomerID')['Country'].unique().items():
    if len(v) > 1:
        # keep the most frequent country for customers recorded in several countries
        df.loc[df['CustomerID'] == i, 'Country'] = df.loc[df['CustomerID'] == i, 'Country'].mode()[0]
df['FinalPrice'] = df['Quantity']*df['UnitPrice']
df['InvoiceMonth'] = df['InvoiceDate'].apply(lambda x: x.strftime('%B'))
df['Day of week'] = df['InvoiceDate'].dt.day_name()
Plots of top selling products by Amount and Value
sns.set_style('whitegrid')
Quan = df.groupby('Description')['Quantity'].agg('sum').sort_values(ascending=False)[0:20]
Price = df.groupby('Description')['FinalPrice'].agg('sum').sort_values(ascending=False)[0:20]
#creating the subplot
fig,axs = plt.subplots(nrows=2, ncols=1, figsize = (12,12))
plt.subplots_adjust(hspace = 0.3)
fig.suptitle('Top 20 Best-Selling Products by Amount and Value', fontsize=15, x = 0.4, y = 0.98)
sns.barplot(x=Quan.values, y=Quan.index, ax= axs[0]).set(xlabel='Total amount of sales')
axs[0].set_title('Based on Amount', size=12, fontweight = 'bold')
sns.barplot(x=Price.values, y=Price.index, ax= axs[1]).set(xlabel='Total value of sales')
axs[1].set_title('Based on Value', size=12, fontweight = 'bold')
plt.show()
Plot of Quantity vs Price
Corr = sns.jointplot(x="Quantity", y="UnitPrice", data = df[df.FinalPrice>0], height = 7)
Corr.fig.suptitle("Price and Quantity Comparison", fontsize = 15, y = 1.1)
plt.show()
Plots of top returned items and customers with the most returns
ReturnedItems = df[df.Quantity<0].groupby('Description')['Quantity'].sum()
ReturnedItems = ReturnedItems.abs().sort_values(ascending=False)[0:10]
ReturnCust = df[df.Quantity<0].groupby(['CustomerID','Country'])['Quantity'].sum()
ReturnCust = ReturnCust.abs().sort_values(ascending=False)[0:10]
# creating the subplot
fig, [ax1, ax2] = plt.subplots(nrows=2, ncols=1, figsize=(12,10))
ReturnedItems.sort_values().plot(kind='barh', ax=ax1).set_title('Top Returned Items', fontsize=15)
ReturnCust.sort_values().plot(kind='barh', ax=ax2).set_title('Customers with the Most Returns', fontsize=15)
ax1.set(xlabel='Quantity')
ax2.set(xlabel='Quantity')
plt.subplots_adjust(hspace=0.4)
plt.show()
Plots showing sales and returns grouped by country
# case-insensitive comparison so UK rows are excluded regardless of how the country name is capitalised
ByCountrySale = df[(df['Country'].str.upper() != 'UNITED KINGDOM') & (df.Quantity > 0)].groupby('Country')['Quantity'].sum()
ByCountryRet = df[(df['Country'].str.upper() != 'UNITED KINGDOM') & (df.Quantity < 0)].groupby('Country')['Quantity'].sum().abs()
fig, [ax1,ax2] = plt.subplots(nrows=2,ncols=1,figsize=(10,14))
ByCountrySale.plot(kind='bar', ax=ax1).set(ylabel = 'Quantity',xlabel='')
ax1.set_title('Sales', size=12, fontweight = 'bold')
ByCountryRet.plot(kind='bar', ax=ax2).set(ylabel = 'Quantity',xlabel='')
ax2.set_title('Returns', size=12, fontweight = 'bold')
plt.suptitle('Sales and Returns in all Countries except UK', fontsize = 15)
plt.subplots_adjust(hspace = 0.6)
plt.show()
Pie chart showing Sales value grouped by day
df.groupby('Day of week')['FinalPrice'].sum().plot(kind = 'pie', autopct = '%.2f%%', figsize=(7,7)).set(ylabel='')
plt.title('% of Sales Value by Day of Week', fontsize = 15)
plt.show()
Plots showing the top repeat customers
MostRepeat = df.groupby(['CustomerID','Country'])['InvoiceNo'].nunique().sort_values(ascending=False)
rep = MostRepeat[MostRepeat != 1].values
nrep = MostRepeat[MostRepeat == 1].values
ser = pd.Series([len(rep)/(len(rep)+len(nrep)),len(nrep)/(len(rep)+len(nrep))], index=['Repeat Customers','One-time Customers'])
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize= (15,5), gridspec_kw= {'width_ratios':[3,1]})
plt.subplots_adjust(wspace=0.2)
sns.barplot(x=MostRepeat[0:10].values, y=MostRepeat[0:10].index, ax=ax1).set(xlabel='Number of Transactions(Repeats)',ylabel='CustomerID')
ser.plot(kind='pie', autopct='%.2f%%', ax=ax2).set(ylabel='')
plt.suptitle('Top Repeat Customers', fontsize=15)
plt.show()
Plot showing the Sales value grouped by Month and Day
HM_Data = df.pivot_table(index = 'InvoiceMonth',columns = 'Day of week', values = 'FinalPrice', aggfunc='sum')
plt.figure(figsize = (10,6))
sns.heatmap(HM_Data, cmap = 'vlag').set(xlabel='', ylabel='')
plt.title('Sales Value per Month and Day of Week', fontsize = 15)
plt.show()
Choropleth map showing sales in each country
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio as py
import cufflinks as cf
import pycountry
import pycountry_convert
from pycountry_convert import country_name_to_country_alpha3
init_notebook_mode(connected=True)
%matplotlib inline
def code(x):
    # map a country name to its ISO alpha-3 code; return None when no match is found
    try:
        return country_name_to_country_alpha3(x, cn_name_format='upper')
    except Exception:
        return None
df['CountryCode'] = df['Country'].map(code)
#creating the choropleth map
mapdata = df.dropna()
map_data = {
'type': 'choropleth',
'autocolorscale' : False,
'colorscale' : 'Portland',
'locations' : mapdata[mapdata['CountryCode']!='GBR'].groupby('CountryCode', sort=False)['FinalPrice'].sum().index,
'z' : mapdata[mapdata['CountryCode']!='GBR'].groupby('CountryCode', sort=False)['FinalPrice'].sum().values,
'text' : mapdata[mapdata['CountryCode']!='GBR']['Country'].unique(),
'colorbar' : {'title':'Pounds'}
}
map_layout = {
'title' : 'Sales in Foreign Countries',
'geo' : {'showframe' : False, 'projection' : {'type':'equirectangular'}}
}
FinalMap = go.Figure(data = [map_data], layout = map_layout)
iplot(FinalMap)
Exploring the number of customers, products and transactions
pd.DataFrame([{'products': len(df['StockCode'].value_counts()),'transactions': len(df['InvoiceNo'].value_counts()),
'customers': len(df['CustomerID'].value_counts()),}], columns = ['products', 'transactions', 'customers'],
index = ['quantity'])
Exploring the transactions grouped by CustomerID
exp = df.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate'].count()
no_products = exp.rename(columns = {'InvoiceDate':'Number of products'})
no_products[:10].sort_values('CustomerID')
Analysis of cancelled orders
no_products['order_cancelled'] = no_products['InvoiceNo'].apply(lambda x:int('C' in x))
display(no_products[:5])
a = no_products['order_cancelled'].sum()
b = no_products.shape[0]
percentage = (a/b)*100
print('Number of orders cancelled: {}/{} ({:.2f}%) '.format(a, b, percentage))
display(df.sort_values('CustomerID')[:10])
We observe that when a transaction is cancelled, there is usually an identical counterpart transaction with a positive quantity.
df_cancel = df[df['Quantity'] < 0][['CustomerID','Quantity',
'StockCode','Description','UnitPrice']]
for index, col in df_cancel.iterrows():
    # look for a positive counterpart with the same customer, quantity and description
    if df[(df['CustomerID'] == col['CustomerID']) & (df['Quantity'] == -col['Quantity'])
          & (df['Description'] == col['Description'])].shape[0] == 0:
        print(df_cancel.loc[index])
        print(15*'-' + '>' + ' HYPOTHESIS NOT FULFILLED')
        break
We see that the initial hypothesis is not fulfilled because of a 'Discount' entry, so we re-run the check discarding the Discount entries.
df_cancel = df[(df['Quantity'] < 0) & (df['Description'] != 'Discount')][
['CustomerID','Quantity','StockCode',
'Description','UnitPrice']]
for index, col in df_cancel.iterrows():
    if df[(df['CustomerID'] == col['CustomerID']) & (df['Quantity'] == -col['Quantity'])
          & (df['Description'] == col['Description'])].shape[0] == 0:
        print(index, df_cancel.loc[index])
        print(15*'-' + '>' + ' HYPOTHESIS NOT FULFILLED')
        break
The hypothesis is again not verified.
Below we check two scenarios: 1) a cancellation exists without a counterpart; 2) there is at least one counterpart with the exact same quantity.
df_copy = df.copy(deep = True)
df_copy['QuantityCanceled'] = 0
entry_to_remove = [] ; doubtfull_entry = []
for index, col in df.iterrows():
if (col['Quantity'] > 0) or col['Description'] == 'Discount': continue
df_test = df[(df['CustomerID'] == col['CustomerID']) &
(df['StockCode'] == col['StockCode']) &
(df['InvoiceDate'] < col['InvoiceDate']) &
(df['Quantity'] > 0)].copy()
# Cancelation WITHOUT counterpart
if (df_test.shape[0] == 0):
doubtfull_entry.append(index)
# Cancelation WITH a counterpart
elif (df_test.shape[0] == 1):
index_order = df_test.index[0]
df_copy.loc[index_order, 'QuantityCanceled'] = -col['Quantity']
entry_to_remove.append(index)
# Various counterparts exist in orders: register the cancellation on the most recent one
elif (df_test.shape[0] > 1):
df_test.sort_index(axis=0 ,ascending=False, inplace = True)
for ind, val in df_test.iterrows():
if val['Quantity'] < -col['Quantity']: continue
df_copy.loc[ind, 'QuantityCanceled'] = -col['Quantity']
entry_to_remove.append(index)
break
print("entry_to_remove: {}".format(len(entry_to_remove)))
print("doubtfull_entry: {}".format(len(doubtfull_entry)))
df_copy.drop(entry_to_remove, axis = 0, inplace = True)
df_copy.drop(doubtfull_entry, axis = 0, inplace = True)
remaining_entries = df_copy[(df_copy['Quantity'] < 0) & (df_copy['StockCode'] != 'D')]
print("nb of entries to delete: {}".format(remaining_entries.shape[0]))
remaining_entries[:5]
# CustomerID was read as a string, so compare against the string value
df_copy[(df_copy['CustomerID'] == '14048') & (df_copy['StockCode'] == '22464')]
list_special_codes = df_copy[df_copy['StockCode'].str.contains('^[a-zA-Z]+', regex=True)]['StockCode'].unique()
list_special_codes
for code in list_special_codes:
print("{:<15} -> {:<30}".format(code, df_copy[df_copy['StockCode'] == code]['Description'].unique()[0]))
Analysis of Basket Price
df_copy['TotalPrice'] = df_copy['UnitPrice'] * (df_copy['Quantity'] - df_copy['QuantityCanceled'])
df_copy.sort_values('CustomerID')[:5]
Purchase amount for every single order
order = df_copy.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
price_of_basket = order.rename(columns = {'TotalPrice':'Basket Price'})
# date of the order
df_copy['InvoiceDate_int'] = df_copy['InvoiceDate'].astype('int64')
order = df_copy.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_copy.drop('InvoiceDate_int', axis = 1, inplace = True)
price_of_basket.loc[:, 'InvoiceDate'] = pd.to_datetime(order['InvoiceDate_int'])
# selection of significant entries
price_of_basket = price_of_basket[price_of_basket['Basket Price'] > 0]
price_of_basket.sort_values('CustomerID')[:6]
Distribution of orders and their total amount of purchases
price_list = [0, 50, 100, 200, 500, 1000, 5000, 50000]
count_price = []
for i, price in enumerate(price_list):
if i == 0: continue
val = price_of_basket[(price_of_basket['Basket Price'] < price) &
(price_of_basket['Basket Price'] > price_list[i-1])]['Basket Price'].count()
count_price.append(val)
# Representation of the number of purchases / amount
plt.rc('font', weight='bold')
f, ax = plt.subplots(figsize=(11, 6))
colors = ['yellowgreen', 'gold', 'wheat', 'c', 'violet', 'royalblue','firebrick']
labels = [ '{}<.<{}'.format(price_list[i-1], s) for i,s in enumerate(price_list) if i != 0]
sizes = count_price
explode = [0.0] * len(sizes)  # no slice is pulled out of the pie
ax.pie(sizes, explode = explode, labels=labels, colors = colors,
autopct = lambda x:'{:1.0f}%'.format(x) if x > 1 else '',
shadow = False, startangle=0)
ax.axis('equal')
f.text(0.5, 1.01, "Distribution of orders based on their total amount of purchases", ha='center', fontsize = 18);
Analysis of the product categories
is_noun = lambda pos: pos[:2] == 'NN'
def keywords_inventory(dataframe, colonne = 'Description'):
stemmer = nltk.stem.SnowballStemmer("english")
keywords_roots = dict() # collect the words / root
keywords_select = dict() # association: root <-> keyword
category_keys = []
count_keywords = dict()
icount = 0
for s in dataframe[colonne]:
if pd.isnull(s): continue
lines = s.lower()
tokenized = nltk.word_tokenize(lines)
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
for t in nouns:
t = t.lower() ; racine = stemmer.stem(t)
if racine in keywords_roots:
keywords_roots[racine].add(t)
count_keywords[racine] += 1
else:
keywords_roots[racine] = {t}
count_keywords[racine] = 1
for s in keywords_roots.keys():
if len(keywords_roots[s]) > 1:
min_length = 1000
for k in keywords_roots[s]:
if len(k) < min_length:
clef = k ; min_length = len(k)
category_keys.append(clef)
keywords_select[s] = clef
else:
category_keys.append(list(keywords_roots[s])[0])
keywords_select[s] = list(keywords_roots[s])[0]
print("number of keywords in variable '{}': {}".format(colonne,len(category_keys)))
return category_keys, keywords_roots, keywords_select, count_keywords
df_products = pd.DataFrame(df['Description'].unique()).rename(columns = {0:'Description'})
keywords, keywords_roots, keywords_select, count_keywords = keywords_inventory(df_products)
products = []
for k,v in count_keywords.items():
products.append([keywords_select[k],v])
products.sort(key = lambda x:x[1], reverse = True)
liste = sorted(products, key = lambda x:x[1], reverse = True)
plt.rc('font', weight='normal')
fig, ax = plt.subplots(figsize=(7, 25))
y_axis = [i[1] for i in liste[:125]]
x_axis = [k for k,i in enumerate(liste[:125])]
x_label = [i[0] for i in liste[:125]]
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 13)
plt.yticks(x_axis, x_label)
plt.xlabel("Number of occurences", fontsize = 18, labelpad = 10)
ax.barh(x_axis, y_axis, align = 'center')
ax = plt.gca()
ax.invert_yaxis()
plt.title("Words occurence",bbox={'facecolor':'k', 'pad':5}, color='w',fontsize = 25)
plt.show()
Forming product categories by keywords
products = []
for k,v in count_keywords.items():
word = keywords_select[k]
if word in ['pink', 'blue', 'tag', 'green', 'orange']: continue
if len(word) < 3 or v < 13: continue
if ('+' in word) or ('/' in word): continue
products.append([word, v])
products.sort(key = lambda x:x[1], reverse = True)
print('Preserved words:', len(products))
liste_produits = df_copy['Description'].unique()
#print(liste_produits[0:2])
X = pd.DataFrame()
for key, occurence in products:
X.loc[:, key] = list(map(lambda x:int(key.upper() in x), liste_produits))
level = [0, 1, 2, 3, 5, 10]
label = []
for i in range(len(level)):
if i == len(level)-1:
col = '.>{}'.format(level[i])
else:
col = '{}<.<{}'.format(level[i],level[i+1])
#print(i)
#print(col)
label.append(col)
X.loc[:, col] = 0
for i, prod in enumerate(liste_produits):
prix = df_copy[ df_copy['Description'] == prod]['UnitPrice'].mean()
#print (prix)
j = 0
while prix > level[j]:
j+=1
if j == len(level): break
X.loc[i, label[j-1]] = 1
print("{:<8} {:<20} \n".format('range', 'number of products') + 20*'-')
for i in range(len(level)):
if i == len(level)-1:
col = '.>{}'.format(level[i])
else:
col = '{}<.<{}'.format(level[i],level[i+1])
print("{:<10} {:<20}".format(col, X.loc[:, col].sum()))
Creating clusters of products
range_clus = X.values
for n_clusters in range(3,10):
kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30)
kmeans.fit(range_clus)
clusters = kmeans.predict(range_clus)
silhouette_avg = silhouette_score(range_clus, clusters)
print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)
n_clusters = 5
silhouette_avg = -1
while silhouette_avg < 0.145:
kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30)
kmeans.fit(range_clus)
clusters = kmeans.predict(range_clus)
silhouette_avg = silhouette_score(range_clus, clusters)
print("For n_clusters =", n_clusters, "The avg silhouette_score:", silhouette_avg)
Representation of the content of clusters
pd.Series(clusters).value_counts()
a: Intra-cluster silhouette score
def graph_component_silhouette(n_clusters, lim_x, mat_size, sample_silhouette_values, clusters):
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(8, 8)
ax1.set_xlim([lim_x[0], lim_x[1]])
ax1.set_ylim([0, mat_size + (n_clusters + 1) * 10])
y_lower = 10
for i in range(n_clusters):
ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, alpha=0.8)
ax1.text(-0.03, y_lower + 0.5 * size_cluster_i, str(i), color = 'red', fontweight = 'bold',
bbox=dict(facecolor='white', edgecolor='black', boxstyle='round, pad=0.3'))
y_lower = y_upper + 10
sample_silhouette_values = silhouette_samples(range_clus, clusters)
graph_component_silhouette(n_clusters, [-0.07, 0.33], len(X), sample_silhouette_values, clusters)
b: Word Cloud
liste = pd.DataFrame(liste_produits)
liste_words = [word for (word, occurence) in products]
occurence = [dict() for _ in range(n_clusters)]
for i in range(n_clusters):
liste_cluster = liste.loc[clusters == i]
for word in liste_words:
if word in ['art', 'set', 'heart', 'pink', 'blue', 'tag']: continue
occurence[i][word] = sum(liste_cluster.loc[:, 0].str.contains(word.upper()))
def random_color_func(word=None, font_size=None, position=None,
orientation=None, font_path=None, random_state=None):
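    # note: 'tone' is a module-level variable set per cluster (in the loop below) to fix the hue of each word cloud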
h = int(360.0 * tone / 255.0)
s = int(100.0 * 255.0 / 255.0)
l = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
return "hsl({}, {}%, {}%)".format(h, s, l)
def make_wordcloud(liste, increment):
ax1 = fig.add_subplot(4,2,increment)
words = dict()
trunc_occurences = liste[0:150]
for s in trunc_occurences:
words[s[0]] = s[1]
wordcloud = WordCloud(width=1000,height=400, background_color='lightgrey',
max_words=1628,relative_scaling=1,
color_func = random_color_func,
normalize_plurals=False)
wordcloud.generate_from_frequencies(words)
ax1.imshow(wordcloud, interpolation="bilinear")
ax1.axis('off')
plt.title('cluster n{}'.format(increment-1))
fig = plt.figure(1, figsize=(14,14))
color = [0, 160, 130, 95, 280, 40, 330, 110, 25]
for i in range(n_clusters):
list_cluster_occurences = occurence[i]
tone = color[i] # define the color of the words
liste = []
for key, value in list_cluster_occurences.items():
liste.append([key, value])
liste.sort(key = lambda x:x[1], reverse = True)
make_wordcloud(liste, i+1)
c: Principal Component Analysis
pca = PCA()
pca.fit(range_clus)
pca_samples = pca.transform(range_clus)
fig, ax = plt.subplots(figsize=(14, 5))
sns.set(font_scale=1)
plt.step(range(range_clus.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid',
label='cumulative explained variance')
sns.barplot(x=np.arange(1, range_clus.shape[1]+1), y=pca.explained_variance_ratio_, alpha=0.5, color='g',
            label='individual explained variance')
plt.xlim(0, 100)
ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()])
plt.ylabel('Explained variance', fontsize = 14)
plt.xlabel('Principal components', fontsize = 14)
plt.legend(loc='upper left', fontsize = 13);
pca = PCA(n_components=50)
matrix_9D = pca.fit_transform(range_clus)
mat = pd.DataFrame(matrix_9D)
mat['cluster'] = pd.Series(clusters)
import matplotlib.patches as mpatches
sns.set_style("white")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2.5})
LABEL_COLOR_MAP = {0:'r', 1:'gold', 2:'b', 3:'k', 4:'c', 5:'g'}
label_color = [LABEL_COLOR_MAP[l] for l in mat['cluster']]
fig = plt.figure(figsize = (12,10))
increment = 0
for ix in range(4):
for iy in range(ix+1, 4):
increment += 1
ax = fig.add_subplot(3,3,increment)
ax.scatter(mat[ix], mat[iy], c= label_color, alpha=0.4)
plt.ylabel('PCA {}'.format(iy+1), fontsize = 12)
plt.xlabel('PCA {}'.format(ix+1), fontsize = 12)
ax.yaxis.grid(color='lightgray', linestyle=':')
ax.xaxis.grid(color='lightgray', linestyle=':')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
if increment == 9: break
if increment == 9: break
comp_handler = []
for i in range(5):
comp_handler.append(mpatches.Patch(color = LABEL_COLOR_MAP[i], label = i))
plt.legend(handles=comp_handler, bbox_to_anchor=(1.1, 0.97),
title='Cluster',
shadow = True, frameon = True, framealpha = 1,fontsize = 13,
bbox_transform = plt.gcf().transFigure) #facecolor = 'lightgrey',
plt.tight_layout()
Categorizing products
corresp = dict()
for key, val in zip (liste_produits, clusters):
corresp[key] = val
df_copy['categ_product'] = df_copy.loc[:, 'Description'].map(corresp)
df_copy[['InvoiceNo', 'Description',
'categ_product']][:10]
Creating categ_N columns that contain the total amount spent in each product category
for i in range(5):
col = 'categ_{}'.format(i)
df_temp = df_copy[df_copy['categ_product'] == i]
price_temp = df_temp['UnitPrice'] * (df_temp['Quantity'] - df_temp['QuantityCanceled'])
price_temp = price_temp.apply(lambda x:x if x > 0 else 0)
df_copy.loc[:, col] = price_temp
df_copy[col].fillna(0, inplace = True)
df_copy[['InvoiceNo', 'Description',
'categ_product', 'categ_0', 'categ_1', 'categ_2', 'categ_3','categ_4']][:10]
Creating a new dataframe that contains, for each order, the basket amount as well as how it is distributed over the 5 product categories:
temp = df_copy.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
price_of_basket = temp.rename(columns = {'TotalPrice':'Basket Price'})
for i in range(5):
col = 'categ_{}'.format(i)
temp = df_copy.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)[col].sum()
price_of_basket.loc[:, col] = temp[col]
df_copy['InvoiceDate_int'] = df_copy['InvoiceDate'].astype('int64')
temp = df_copy.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_copy.drop('InvoiceDate_int', axis = 1, inplace = True)
price_of_basket.loc[:, 'InvoiceDate'] = pd.to_datetime(temp['InvoiceDate_int'])
price_of_basket = price_of_basket[price_of_basket['Basket Price'] > 0]
price_of_basket.sort_values('CustomerID', ascending = True)[:5]
print(price_of_basket['InvoiceDate'].min(), '->', price_of_basket['InvoiceDate'].max())
set_entrainement = price_of_basket[price_of_basket['InvoiceDate'] < pd.Timestamp(datetime.date(2011,10,1))]
set_test = price_of_basket[price_of_basket['InvoiceDate'] >= pd.Timestamp(datetime.date(2011,10,1))]
price_of_basket = set_entrainement.copy(deep = True)
set_test
Consumer order combinations
# of visits and stats on cart amount / users
transactions_per_user=price_of_basket.groupby(by=['CustomerID'])['Basket Price'].agg(['count','min',
'max','mean','sum'])
for i in range(5):
col = 'categ_{}'.format(i)
transactions_per_user.loc[:,col] = price_of_basket.groupby(by=['CustomerID'])[col].sum() /\
transactions_per_user['sum']*100
transactions_per_user.reset_index(drop = False, inplace = True)
price_of_basket.groupby(by=['CustomerID'])['categ_0'].sum()
transactions_per_user.sort_values('CustomerID', ascending = True)[:5]
Defining two additional variables that give the number of days elapsed since the first purchase (FirstPurchase) and the number of days since the last purchase (LastPurchase):
last_date = price_of_basket['InvoiceDate'].max().date()
first_registration = pd.DataFrame(price_of_basket.groupby(by=['CustomerID'])['InvoiceDate'].min())
last_purchase = pd.DataFrame(price_of_basket.groupby(by=['CustomerID'])['InvoiceDate'].max())
test = first_registration.applymap(lambda x:(last_date - x.date()).days)
test2 = last_purchase.applymap(lambda x:(last_date - x.date()).days)
transactions_per_user.loc[:, 'LastPurchase'] = test2.reset_index(drop = False)['InvoiceDate']
transactions_per_user.loc[:, 'FirstPurchase'] = test.reset_index(drop = False)['InvoiceDate']
transactions_per_user[:5]
n1 = transactions_per_user[transactions_per_user['count'] == 1].shape[0]
n2 = transactions_per_user.shape[0]
print("No. customers with single purchase: {:<2}/{:<5} ({:<2.2f}%)".format(n1,n2,n1/n2*100))
list_cols = ['count','min','max','mean','categ_0','categ_1','categ_2','categ_3','categ_4']
selected_customers = transactions_per_user.copy(deep = True)
range_clus = selected_customers[list_cols].values
scaler = StandardScaler()
scaler.fit(range_clus)
print('variables mean values: \n' + 90*'-' + '\n' , scaler.mean_)
scaled_matrix = scaler.transform(range_clus)
Defining a lower-dimensional basis to describe the scaled_matrix. This basis is used to build a representation of the different clusters and thus verify the quality of their separation, so a PCA is performed first:
pca = PCA()
pca.fit(scaled_matrix)
pca_samples = pca.transform(scaled_matrix)
fig, ax = plt.subplots(figsize=(14, 5))
sns.set(font_scale=1)
plt.step(range(range_clus.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid',
label='cumulative explained variance')
sns.barplot(x=np.arange(1, range_clus.shape[1]+1), y=pca.explained_variance_ratio_, alpha=0.5, color='g',
            label='individual explained variance')
plt.xlim(0, 10)
ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()])
plt.ylabel('Explained variance', fontsize = 14)
plt.xlabel('Principal components', fontsize = 14)
plt.legend(loc='best', fontsize = 13);
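To turn the plot into a number (a hedged check reusing the fitted pca from above), we can count how many components are needed to reach, say, 90% of the variance:
cum_var = pca.explained_variance_ratio_.cumsum()
n_comp_90 = np.argmax(cum_var >= 0.90) + 1
print('components needed for 90% of the variance:', n_comp_90)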
Creating customer categories
n_clusters = 11
kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=100)
kmeans.fit(scaled_matrix)
clusters_clients = kmeans.predict(scaled_matrix)
silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)
print('silhouette score: {:<.3f}'.format(silhouette_avg))
Checking the number of customers in each cluster
pd.DataFrame(pd.Series(clusters_clients).value_counts(), columns = ['number of clients']).T
pca = PCA(n_components=6)
matrix_3D = pca.fit_transform(scaled_matrix)
mat = pd.DataFrame(matrix_3D)
mat['cluster'] = pd.Series(clusters_clients)
a: Visualizing the clusters using PCA
import matplotlib.patches as mpatches
sns.set_style("white")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2.5})
LABEL_COLOR_MAP = {0:'r', 1:'tan', 2:'b', 3:'k', 4:'c', 5:'g', 6:'deeppink', 7:'skyblue', 8:'darkcyan',
9:'orange',
10:'yellow', 11:'tomato', 12:'seagreen'}
label_color = [LABEL_COLOR_MAP[l] for l in mat['cluster']]
fig = plt.figure(figsize = (12,10))
increment = 0
for ix in range(6):
for iy in range(ix+1, 6):
increment += 1
ax = fig.add_subplot(4,3,increment)
ax.scatter(mat[ix], mat[iy], c= label_color, alpha=0.5)
plt.ylabel('PCA {}'.format(iy+1), fontsize = 12)
plt.xlabel('PCA {}'.format(ix+1), fontsize = 12)
ax.yaxis.grid(color='lightgray', linestyle=':')
ax.xaxis.grid(color='lightgray', linestyle=':')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
if increment == 12: break
if increment == 12: break
#_______________________________________________
# set the legend: cluster index -> color
comp_handler = []
for i in range(n_clusters):
comp_handler.append(mpatches.Patch(color = LABEL_COLOR_MAP[i], label = i))
plt.legend(handles=comp_handler, bbox_to_anchor=(1.1, 0.9),
title='Cluster',
shadow = True, frameon = True, framealpha = 1,
fontsize = 13, bbox_transform = plt.gcf().transFigure) #facecolor = 'lightgrey',
plt.tight_layout()
Intra-cluster silhouette score
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)
graph_component_silhouette(n_clusters, [-0.15, 0.55], len(scaled_matrix), sample_silhouette_values,
clusters_clients)
Radar charts
At this stage, I have verified that the different clusters are indeed disjoint (at least, in a global way). It remains to understand the habits of the customers in each cluster. To do so, I start by adding to the selected_customers dataframe a variable that defines the cluster to which each client belongs:
selected_customers.loc[:,'cluster'] = clusters_clients
merged_df = pd.DataFrame()
for i in range(n_clusters):
test = pd.DataFrame(selected_customers[selected_customers['cluster'] == i].mean())
test = test.T.set_index('cluster', drop = True)
test['size'] = selected_customers[selected_customers['cluster'] == i].shape[0]
merged_df = pd.concat([merged_df, test])
#_____________________________________________________
merged_df.drop('CustomerID', axis = 1, inplace = True)
print('number of customers:', merged_df['size'].sum())
merged_df = merged_df.sort_values('sum')
Re-ordering the dataframe by relating the clusters to the total amount spent and the dominant product category
liste_index = []
for i in range(5):
column = 'categ_{}'.format(i)
liste_index.append(merged_df[merged_df[column] > 45].index.values[0])
liste_index_reordered = liste_index
liste_index_reordered += [ s for s in merged_df.index if s not in liste_index]
merged_df = merged_df.reindex(index = liste_index_reordered)
merged_df = merged_df.reset_index(drop = False)
display(merged_df[['cluster', 'count', 'min', 'max', 'mean', 'sum', 'categ_0',
'categ_1', 'categ_2', 'categ_3', 'categ_4', 'size']])
Graphical representation of the customer categories
def _scale_data(data, ranges):
(x1, x2) = ranges[0]
d = data[0]
return [(d - y1) / (y2 - y1) * (x2 - x1) + x1 for d, (y1, y2) in zip(data, ranges)]
class RadarChart():
def __init__(self, fig, location, sizes, variables, ranges, n_ordinate_levels = 6):
angles = np.arange(0, 360, 360./len(variables))
ix, iy = location[:] ; size_x, size_y = sizes[:]
axes = [fig.add_axes([ix, iy, size_x, size_y], polar = True,
label = "axes{}".format(i)) for i in range(len(variables))]
_, text = axes[0].set_thetagrids(angles, labels = variables)
for txt, angle in zip(text, angles):
if angle > -1 and angle < 181:
txt.set_rotation(angle - 90)
else:
txt.set_rotation(angle - 270)
for ax in axes[1:]:
ax.patch.set_visible(False)
ax.xaxis.set_visible(False)
ax.grid("off")
for i, ax in enumerate(axes):
grid = np.linspace(*ranges[i],num = n_ordinate_levels)
grid_label = [""]+["{:.0f}".format(x) for x in grid[1:-1]]
ax.set_rgrids(grid, labels = grid_label, angle = angles[i])
ax.set_ylim(*ranges[i])
self.angle = np.deg2rad(np.r_[angles, angles[0]])
self.ranges = ranges
self.ax = axes[0]
def plot(self, data, *args, **kw):
sdata = _scale_data(data, self.ranges)
self.ax.plot(self.angle, np.r_[sdata, sdata[0]], *args, **kw)
def fill(self, data, *args, **kw):
sdata = _scale_data(data, self.ranges)
self.ax.fill(self.angle, np.r_[sdata, sdata[0]], *args, **kw)
def legend(self, *args, **kw):
self.ax.legend(*args, **kw)
def title(self, title, *args, **kw):
self.ax.text(0.9, 1, title, transform = self.ax.transAxes, *args, **kw)
fig = plt.figure(figsize=(50,50))
attributes = ['count', 'mean', 'sum', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']
ranges = [[0.01, 10], [0.01, 1500], [0.01, 10000], [0.01, 75], [0.01, 75], [0.01, 75], [0.01, 75], [0.01, 75]]
index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
n_groups = n_clusters ; i_cols = 3
i_rows = n_groups//i_cols
size_x, size_y = (1/i_cols), (1/i_rows)
for ind in range(n_clusters):
ix = ind%3 ; iy = i_rows - ind//3
pos_x = ix*(size_x + 0.05) ; pos_y = iy*(size_y + 0.05)
location = [pos_x, pos_y] ; sizes = [size_x, size_y]
data = np.array(merged_df.loc[index[ind], attributes])
radar = RadarChart(fig, location, sizes, attributes, ranges)
radar.plot(data, color = 'b', linewidth=5.0)
radar.fill(data, alpha = 0.2, color = 'b')
radar.title(title = 'cluster n{}'.format(index[ind]), color = 'r')
ind += 1
Classification of customers: in this part, the objective is to fit a classifier that assigns consumers to the customer categories established in the previous section.
class Class_Fit(object):
def __init__(self, clf, params=None):
if params:
self.clf = clf(**params)
else:
self.clf = clf()
def train(self, x_train, y_train):
self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
def grid_search(self, parameters, Kfold):
self.grid = GridSearchCV(estimator = self.clf, param_grid = parameters, cv = Kfold)
def grid_fit(self, X, Y):
self.grid.fit(X, Y)
def grid_predict(self, X, Y):
self.predictions = self.grid.predict(X)
print("Precision: {:.2f} % ".format(100*metrics.accuracy_score(Y, self.predictions)))
selected_customers.head()
columns = ['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4' ]
X = selected_customers[columns]
Y = selected_customers['cluster']
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, train_size = 0.8)
SVC
svc = Class_Fit(clf = svm.LinearSVC)
svc.grid_search(parameters = [{'C':np.logspace(-2,2,10)}], Kfold = 10)
svc.grid_fit(X = X_train, Y = Y_train)
svc.grid_predict(X_test, Y_test)
Confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=0)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
#_________________________________________________
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
class_names = list(range(n_clusters))
cnf_matrix = confusion_matrix(Y_test, svc.predictions)
np.set_printoptions(precision=2)
plt.figure(figsize = (8,8))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize = False, title='Confusion matrix')
SVC learning curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 10)):
"""Generate a simple plot of the test and training learning curve"""
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.legend(loc="best")
return plt
g = plot_learning_curve(svc.grid.best_estimator_,
"SVC learning curves", X_train, Y_train, ylim = [1.01, 0.6],
cv = 5, train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1])
On this curve, we can see that the training and cross-validation curves converge towards the same limit as the sample size increases. This is typical of a model with low variance and indicates that the model does not suffer from overfitting. The accuracy of the training curve is also high, which is synonymous with low bias; hence the model does not underfit the data.
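As a numeric cross-check of this reading (a hedged sketch reusing svc.grid and the training data from above), we can compare the training accuracy with the mean cross-validation accuracy:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(svc.grid.best_estimator_, X_train, Y_train, cv=5)
train_score = svc.grid.best_estimator_.score(X_train, Y_train)
print("train accuracy: {:.3f} | mean CV accuracy: {:.3f}".format(train_score, cv_scores.mean()))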
Logistic Regression
lr = Class_Fit(clf = linear_model.LogisticRegression)
lr.grid_search(parameters = [{'C':np.logspace(-2,2,20)}], Kfold = 5)
lr.grid_fit(X = X_train, Y = Y_train)
lr.grid_predict(X_test, Y_test)
g = plot_learning_curve(lr.grid.best_estimator_, "Logistic Regression learning curves", X_train, Y_train,
ylim = [1.01, 0.7], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
KNN
knn = Class_Fit(clf = neighbors.KNeighborsClassifier)
knn.grid_search(parameters = [{'n_neighbors': np.arange(1,50,1)}], Kfold = 5)
knn.grid_fit(X = X_train, Y = Y_train)
knn.grid_predict(X_test, Y_test)
g = plot_learning_curve(knn.grid.best_estimator_, "Nearest Neighbors learning curves", X_train, Y_train,
ylim = [1.01, 0.7], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
Decision Tree
tr = Class_Fit(clf = tree.DecisionTreeClassifier)
tr.grid_search(parameters = [{'criterion' : ['entropy', 'gini'], 'max_features' :['sqrt', 'log2']}], Kfold = 5)
tr.grid_fit(X = X_train, Y = Y_train)
tr.grid_predict(X_test, Y_test)
g = plot_learning_curve(tr.grid.best_estimator_, "Decision tree learning curves", X_train, Y_train,
ylim = [1.01, 0.7], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
Random Forest
rf = Class_Fit(clf = ensemble.RandomForestClassifier)
param_grid = {'criterion' : ['entropy', 'gini'], 'n_estimators' : [20, 40, 60, 80, 100],
'max_features' :['sqrt', 'log2']}
rf.grid_search(parameters = param_grid, Kfold = 5)
rf.grid_fit(X = X_train, Y = Y_train)
rf.grid_predict(X_test, Y_test)
g = plot_learning_curve(rf.grid.best_estimator_, "Random Forest learning curves", X_train, Y_train,
ylim = [1.01, 0.7], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
AdaBoost Classifier
ada = Class_Fit(clf = AdaBoostClassifier)
param_grid = {'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
ada.grid_search(parameters = param_grid, Kfold = 5)
ada.grid_fit(X = X_train, Y = Y_train)
ada.grid_predict(X_test, Y_test)
g = plot_learning_curve(ada.grid.best_estimator_, "AdaBoost learning curves", X_train, Y_train,
ylim = [1.01, 0.4], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
Gradient Boosting Classifier
gb = Class_Fit(clf = ensemble.GradientBoostingClassifier)
param_grid = {'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
gb.grid_search(parameters = param_grid, Kfold = 5)
gb.grid_fit(X = X_train, Y = Y_train)
gb.grid_predict(X_test, Y_test)
g = plot_learning_curve(gb.grid.best_estimator_, "Gradient Boosting learning curves", X_train, Y_train,
ylim = [1.01, 0.7], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
Conclusion
rf_best = ensemble.RandomForestClassifier(**rf.grid.best_params_)
gb_best = ensemble.GradientBoostingClassifier(**gb.grid.best_params_)
svc_best = svm.LinearSVC(**svc.grid.best_params_)
tr_best = tree.DecisionTreeClassifier(**tr.grid.best_params_)
knn_best = neighbors.KNeighborsClassifier(**knn.grid.best_params_)
lr_best = linear_model.LogisticRegression(**lr.grid.best_params_)
votingC = ensemble.VotingClassifier(estimators=[('rf', rf_best),('gb', gb_best),
('knn', knn_best)], voting='soft')
votingC = votingC.fit(X_train, Y_train)
predictions = votingC.predict(X_test)
print("Precision: {:.2f} % ".format(100*metrics.accuracy_score(Y_test, predictions)))
Testing Predictions
price_of_basket = set_test.copy(deep = True)
transactions_per_user=price_of_basket.groupby(by=['CustomerID'])['Basket Price'].agg(['count','min','max','mean','sum'])
for i in range(5):
col = 'categ_{}'.format(i)
transactions_per_user.loc[:,col] = price_of_basket.groupby(by=['CustomerID'])[col].sum() /\
transactions_per_user['sum']*100
transactions_per_user.reset_index(drop = False, inplace = True)
price_of_basket.groupby(by=['CustomerID'])['categ_0'].sum()
#_______________________
# Correcting for the time range: the test period is about 5 times shorter than the training period
transactions_per_user['count'] = 5 * transactions_per_user['count']
transactions_per_user['sum'] = transactions_per_user['count'] * transactions_per_user['mean']
transactions_per_user.sort_values('CustomerID', ascending = True)[:5]
list_cols = ['count','min','max','mean','categ_0','categ_1','categ_2','categ_3','categ_4']
matrix_test = transactions_per_user[list_cols].values
scaled_test_matrix = scaler.transform(matrix_test)
Y = kmeans.predict(scaled_test_matrix)
columns = ['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4' ]
X = transactions_per_user[columns]
classifiers = [(svc, 'Support Vector Machine'),
(lr, 'Logistic Regression'),
(knn, 'k-Nearest Neighbors'),
(tr, 'Decision Tree'),
(rf, 'Random Forest'),
(gb, 'Gradient Boosting')]
#______________________________
for clf, label in classifiers:
print(30*'_', '\n{}'.format(label))
clf.grid_predict(X, Y)
predictions = votingC.predict(X)
print("Precision: {:.2f} % ".format(100*metrics.accuracy_score(Y, predictions)))