###############################################################
# RFM-KNN (rule-based RFM segments compared with K-Means clusters)
###############################################################
#
# Pipeline:
#   1. Load the retail transactions, clean them and derive the
#      Recency / Frequency / Monetary metrics per customer.
#   2. Turn the metrics into quintile scores and map the
#      Recency+Frequency score pair to a named segment.
#   3. Cluster customers on (Recency, Frequency) with K-Means, compare
#      the clusters with the named segments and draw dendrograms.

################################################
# 1. EDA & Feature Engineering
################################################

import datetime as dt
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

try:
    # Optional dependency: only the automated elbow plot needs it.
    from yellowbrick.cluster import KElbowVisualizer
except ImportError:
    KElbowVisualizer = None

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

DATA_PATH = r"C:\Users\hp\PycharmProjects\VBO\WEEK_03\online_retail_II.xlsx"
QUANTILES = [0.01, 0.1, 0.25, 0.5, 0.75, 0.90, 0.99]

df_ = pd.read_excel(DATA_PATH, sheet_name="Year 2010-2011")
df = df_.copy()

# Inspection output is printed explicitly: a bare expression such as
# `df.head()` shows nothing when the file is run as a script.
print(df.head())
print(df.describe(QUANTILES))
print(df.isnull().sum())

# Keep only invoices whose number does not contain "C" (presumably the
# cancellation marker in this dataset - confirm), with a positive
# quantity and no missing values.
df = df[~df["Invoice"].str.contains("C", na=False)]
df = df[df["Quantity"] > 0]
# .copy() detaches the result from the filtered slice so the column
# assignment below does not trigger chained-assignment warnings.
df = df.dropna().copy()
print(df.describe(QUANTILES))

df["TotalPrice"] = df["Quantity"] * df["Price"]
print(df.describe(QUANTILES))

# RFM metrics calculation.
# The latest invoice date is printed for reference; `today_date` is the
# fixed analysis date that Recency is measured against.
print(df["InvoiceDate"].max())
today_date = dt.datetime(2011, 12, 11)

rfm = df.groupby("Customer ID").agg(
    {
        "InvoiceDate": lambda date: (today_date - date.max()).days,
        "Invoice": "nunique",
        "TotalPrice": "sum",
    }
)
rfm.columns = ["Recency", "Frequency", "Monetary"]

# Each comparison needs its own parentheses: `&` binds tighter than `>`,
# so without them the Frequency condition was never applied as written.
rfm = rfm[(rfm["Monetary"] > 0) & (rfm["Frequency"] > 0)].copy()

print(rfm.head())
print(rfm.describe(QUANTILES))

##################################
# 2. RFM SCORES
##################################

# Lower Recency is better, hence the reversed labels.
rfm["RecencyScore"] = pd.qcut(rfm["Recency"], 5, labels=[5, 4, 3, 2, 1])
# rank(method="first") breaks ties so qcut can form five distinct bins.
rfm["FrequencyScore"] = pd.qcut(
    rfm["Frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5]
)
rfm["MonetaryScore"] = pd.qcut(rfm["Monetary"], 5, labels=[1, 2, 3, 4, 5])

# Two-character score: Recency digit followed by Frequency digit.
rfm["RFM_SCORE"] = (
    rfm["RecencyScore"].astype(str) + rfm["FrequencyScore"].astype(str)
)

# RFM segment naming: regex over the two-character score -> segment name.
seg_map = {
    r'[1-2][1-2]': 'Hibernating',
    r'[1-2][3-4]': 'At_Risk',
    r'[1-2]5': 'Cant_Loose',
    r'3[1-2]': 'About_to_Sleep',
    r'33': 'Need_Attention',
    r'[3-4][4-5]': 'Loyal_Customers',
    r'41': 'Promising',
    r'51': 'New_Customers',
    r'[4-5][2-3]': 'Potential_Loyalists',
    r'5[4-5]': 'Champions',
}

rfm["Segment"] = rfm["RFM_SCORE"].replace(seg_map, regex=True)
print(rfm.head())

segment_summary = (
    rfm[["Segment", "Recency", "Frequency", "Monetary"]]
    .groupby("Segment")
    .agg(["mean", "count", "max"])
)
print(segment_summary)
print(rfm["Segment"].value_counts())

# From here on "Customer ID" is a regular column of `rfm`.
rfm = rfm.reset_index()
print(rfm.head())

##################################
# 3. SEGMENTATION WITH ML - KMEANS
##################################

# Customer ID is an identifier, not a feature: keep it as the index so
# it cannot influence the distance computations.
rfm_knn = rfm.set_index("Customer ID")[["Recency", "Frequency"]]
print(rfm_knn.head())

# K-Means and hierarchical clustering are distance based, so both
# features are brought to the same [0, 1] range first.
scaler = MinMaxScaler((0, 1))
rfm_scaled = scaler.fit_transform(rfm_knn)

# Elbow method: inertia (sum of squared distances) for each k.
K = range(1, 30)
ssd = [
    KMeans(n_clusters=k, random_state=42).fit(rfm_scaled).inertia_
    for k in K
]

plt.plot(K, ssd, "bx-")
plt.xlabel("Küme Sayısı")
plt.title("Optimum Küme sayısı için Elbow Yöntemi")
plt.show()

# Same analysis with yellowbrick's visualizer, when it is installed.
if KElbowVisualizer is not None:
    kmeans = KMeans(random_state=42)
    visu = KElbowVisualizer(kmeans, k=(1, 30))
    visu.fit(rfm_scaled)
    visu.show()

# Final clusters
kmeans = KMeans(n_clusters=4, random_state=42).fit(rfm_scaled)
clusters = kmeans.labels_

kmeans_final = rfm_knn.copy()
# Shift labels so they start at 1 instead of 0.
kmeans_final["kmeans_segment"] = clusters + 1
print(kmeans_final.head())
print(rfm.head())

# Cross-tabulate K-Means clusters against the rule-based RFM segments;
# the join aligns both sides on Customer ID.
cross_df = kmeans_final.join(rfm.set_index("Customer ID")["Segment"])
print(cross_df.head())
cross_df = cross_df.drop(columns=["Recency", "Frequency"])
cross_table = pd.crosstab(cross_df["kmeans_segment"], cross_df["Segment"])
print(cross_table)

# Dendrogram
# Linkage is computed on the scaled Recency/Frequency matrix only, so
# neither the identifier nor the K-Means label leaks into the hierarchy.
kmrfm_complete = linkage(rfm_scaled, "complete")
kmrfm_average = linkage(rfm_scaled, "average")

plt.figure(figsize=(10, 5))
plt.title("Hiyerarşik Kümeleme Dendogramı")
plt.xlabel("Gözlem Birimleri")
plt.ylabel("Uzaklıklar")
dendrogram(kmrfm_complete, leaf_font_size=10)
plt.show()

plt.figure(figsize=(15, 10))
plt.title("Hiyerarşik Kümeleme Dendogramı")
plt.xlabel("Gözlem Birimleri")
plt.ylabel("Uzaklıklar")
dendrogram(
    kmrfm_average,
    truncate_mode="lastp",
    p=10,
    show_contracted=True,
    leaf_font_size=10,
)
plt.show()