import pandas as pd
import numpy as np

# %pip install lifetimes

# Per-customer summary (frequency, recency, monetary_value, T) built from the transaction table.
SELECT CustomerID
     # distinct purchase days minus one
     , COUNT(DISTINCT DATE(TransactionDate)) - 1 AS frequency
     # days between the customer's first and last transaction dates
     , DATE_DIFF(DATE(MAX(TransactionDate)), DATE(MIN(TransactionDate)), DAY) AS recency

     # takes the applied discount into account
     , AVG(TotalAmount) AS monetary_value

     # the current date is assumed to be 2024-05-01
     , DATE_DIFF('2024-05-01', DATE(MIN(TransactionDate)), DAY) AS T
FROM `eminent-ring-451902-n9.ai.retail_transaction`
GROUP BY CustomerID
# keep only customers whose frequency is above zero
HAVING frequency > 0
# NOTE(review): LIMIT without ORDER BY - which 1000 customers come back is not pinned down; confirm this is acceptable
LIMIT 1000;

# Per-customer summary table; the first CSV column becomes the index.
# Column meanings:
#   frequency      = number of repeat / additional purchases
#   recency        = time between the first and the last purchase
#   monetary_value = average amount spent in a single purchase
#   T              = active period (from the first purchase until "now")
df = pd.read_csv('/content/sample_data/retail_freqmoreone.csv', index_col=0)
df.head()

# Take an independent copy of the three model inputs: columns are added to
# `data` later on, and assigning into a bare slice of `df` triggers pandas'
# SettingWithCopyWarning and may not behave as intended.
data = df[['frequency', 'recency', 'T']].copy()
data.head()

# Summary statistics and missing-value counts for the model inputs.
print(data[['frequency', 'recency', 'T']].describe())
print(data[['frequency', 'recency', 'T']].isnull().sum())

         frequency      recency            T
count  1000.000000  1000.000000  1000.000000
mean      1.046000   123.748000   249.207000
std       0.218934    85.765122    85.725685
min       1.000000     1.000000    13.000000
25%       1.000000    51.000000   185.000000
50%       1.000000   112.000000   263.000000
75%       1.000000   185.000000   321.250000
max       3.000000   356.000000   367.000000
frequency    0
recency      0
T            0
dtype: int64

from lifetimes import BetaGeoFitter     # BetaGeoFitter takes frequency, recency and T as its inputs.


# Create the BG/NBD model object
bgf = BetaGeoFitter(penalizer_coef=0.01)

# Fit the model using the frequency, recency and T columns of the data
bgf.fit(data['frequency'], data['recency'], data['T'])

# Show the fitted model parameters
bgf

<lifetimes.BetaGeoFitter: fitted with 1000 subjects, a: 0.66, alpha: 229.96, b: 0.06, r: 2.26>

bgf.summary

from lifetimes.plotting import plot_frequency_recency_matrix

# Heatmap titled "Expected Number of Future Purchases for 1 Unit of Time,
# by Frequency and Recency of a Customer" for the fitted model.
plot_frequency_recency_matrix(bgf)

<Axes: title={'center': 'Expected Number of Future Purchases for 1 Unit of Time,\nby Frequency and Recency of a Customer'}, xlabel="Customer's Historical Frequency", ylabel="Customer's Recency">

from lifetimes.plotting import plot_probability_alive_matrix

# Heatmap titled "Probability Customer is Alive, by Frequency and Recency
# of a Customer" for the fitted model.
plot_probability_alive_matrix(bgf)

<Axes: title={'center': 'Probability Customer is Alive,\nby Frequency and Recency of a Customer'}, xlabel="Customer's Historical Frequency", ylabel="Customer's Recency">

def _expected_purchases(horizon_days):
    """Expected purchase count per customer over the next `horizon_days` days.

    Thin wrapper around
    bgf.conditional_expected_number_of_purchases_up_to_time(t, frequency, recency, T),
    fed with the matching columns of `data`.
    """
    return bgf.conditional_expected_number_of_purchases_up_to_time(
        horizon_days,
        data['frequency'],
        data['recency'],
        data['T'],
    )


# Outlook for the coming week; then sort and show the top five customers.
t = 7
data['predicted_purchases'] = _expected_purchases(t)
data.sort_values(by='predicted_purchases', ascending=False).head(5)

# Outlook for the next 30 days (overwrites the same column).
t = 30
data['predicted_purchases'] = _expected_purchases(t)
data.sort_values(by='predicted_purchases', ascending=False).head(5)

# Keep every horizon side by side, one column per horizon.
for t in [7, 30, 90]:
    col = f'predicted_purchases_{t}d'
    data[col] = _expected_purchases(t)

# Outlook for the next 90 days (again overwrites 'predicted_purchases').
t = 90
data['predicted_purchases'] = _expected_purchases(t)
data.sort_values(by='predicted_purchases', ascending=False).head(5)

# Tier customers with a simple set of thresholds
def classify_customer(x):
    """Return the segment label for an expected-purchase value `x`.

    Each threshold is a strict lower bound, checked from the highest tier
    down; a value that clears none of them is labelled 'Churn Risk'.
    """
    tiers = (
        (0.10, 'VIP'),
        (0.05, 'Potential'),
        (0.01, 'Passive'),
    )
    for lower_bound, label in tiers:
        if x > lower_bound:
            return label
    return 'Churn Risk'


# Label every customer by applying classify_customer to the 30-day prediction column.
data['segment'] = data['predicted_purchases_30d'].apply(classify_customer)

# Check the distribution of segments
data['segment'].value_counts()

import matplotlib.pyplot as plt
import seaborn as sns

# Colour assigned to each segment
segment_palette = {
    'VIP': '#FF6B6B',         # Red
    'Potential': '#FFD93D',   # Yellow
    'Passive': '#6BCB77',     # Green
    'Churn Risk': '#4D96FF'   # Blue
}

# Visualization: number of customers in each segment
plt.figure(figsize=(8, 5))
sns.countplot(
    data=data,
    x='segment',
    # seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
    # variable and hiding the legend keeps the per-segment colours.
    hue='segment',
    order=['VIP', 'Potential', 'Passive', 'Churn Risk'],
    palette=segment_palette,
    legend=False,
)

plt.title('Customer Segment Distribution (30-day Prediction)', fontsize=14)
plt.xlabel('Segment', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()

from lifetimes.plotting import plot_period_transactions
# Model-fit check: draws the "Frequency of Repeat Transactions" chart for the fitted model.
plot_period_transactions(bgf)

<Axes: title={'center': 'Frequency of Repeat Transactions'}, xlabel='Number of Calibration Period Transactions', ylabel='Customers'>

# Prediction horizon handed to bgf.predict below.
# NOTE(review): the note previously attached to this line described a 30-day
# prediction, but the value is 10 - confirm which horizon is intended.
t = 10

# Take the row at position 20 of `data` and predict for that single customer.
individual = data.iloc[20]
bgf.predict(t, individual['frequency'], individual['recency'], individual['T'])

np.float64(0.005807447546305386)

# Raw transaction file; the CustomerID and TransactionDate columns are used further down.
transaction_data = pd.read_csv('/content/sample_data/Retail_Transaction_Dataset.csv')
transaction_data.head()

from lifetimes.plotting import plot_history_alive


# Customer to inspect; named `customer_id` so the builtin id() is not shadowed.
customer_id = 294753
days_since_birth = 400

# All transactions of that customer. `.copy()` keeps the date conversion below
# from writing into a slice of `transaction_data`.
sp_trans = transaction_data.loc[transaction_data['CustomerID'] == customer_id].copy()

# plot_history_alive hands values from the date column to plt.xlim; left as
# raw strings (e.g. '2/8/2024 7:51') matplotlib raises ConversionError, so
# parse them into datetimes first. The values are month/day/year, which is
# what pandas assumes by default.
sp_trans['TransactionDate'] = pd.to_datetime(sp_trans['TransactionDate'])

plot_history_alive(bgf, days_since_birth, sp_trans, 'TransactionDate')

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
/usr/local/lib/python3.11/dist-packages/matplotlib/axis.py in convert_units(self, x)
   1821         try:
-> 1822             ret = self._converter.convert(x, self.units, self)
   1823         except Exception as e:

/usr/local/lib/python3.11/dist-packages/matplotlib/dates.py in convert(self, *args, **kwargs)
   1833     def convert(self, *args, **kwargs):
-> 1834         return self._get_converter().convert(*args, **kwargs)
   1835 

/usr/local/lib/python3.11/dist-packages/matplotlib/dates.py in convert(value, unit, axis)
   1761         """
-> 1762         return date2num(value)
   1763 

/usr/local/lib/python3.11/dist-packages/matplotlib/dates.py in date2num(d)
    443             return d
--> 444         tzi = getattr(d[0], 'tzinfo', None)
    445         if tzi is not None:

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

The above exception was the direct cause of the following exception:

ConversionError                           Traceback (most recent call last)
<ipython-input-107-ac644562725d> in <cell line: 0>()
      6 
      7 sp_trans = transaction_data.loc[transaction_data['CustomerID'] == id]
----> 8 plot_history_alive(bgf, days_since_birth, sp_trans, 'TransactionDate')

/usr/local/lib/python3.11/dist-packages/lifetimes/plotting.py in plot_history_alive(model, t, transactions, datetime_col, freq, start_date, ax, **kwargs)
    382     plt.ylim(0, 1.0)
    383     plt.yticks(np.arange(0, 1.1, 0.1))
--> 384     plt.xlim(start_date, path_dates[-1])
    385     plt.legend(loc=3)
    386     plt.ylabel("P_alive")

/usr/local/lib/python3.11/dist-packages/matplotlib/pyplot.py in xlim(*args, **kwargs)
   2121     if not args and not kwargs:
   2122         return ax.get_xlim()
-> 2123     ret = ax.set_xlim(*args, **kwargs)
   2124     return ret
   2125 

/usr/local/lib/python3.11/dist-packages/matplotlib/axes/_base.py in set_xlim(self, left, right, emit, auto, xmin, xmax)
   3781                 raise TypeError("Cannot pass both 'right' and 'xmax'")
   3782             right = xmax
-> 3783         return self.xaxis._set_lim(left, right, emit=emit, auto=auto)
   3784 
   3785     get_xscale = _axis_method_wrapper("xaxis", "get_scale")

/usr/local/lib/python3.11/dist-packages/matplotlib/axis.py in _set_lim(self, v0, v1, emit, auto)
   1225 
   1226         self.axes._process_unit_info([(name, (v0, v1))], convert=False)
-> 1227         v0 = self.axes._validate_converted_limits(v0, self.convert_units)
   1228         v1 = self.axes._validate_converted_limits(v1, self.convert_units)
   1229 

/usr/local/lib/python3.11/dist-packages/matplotlib/axes/_base.py in _validate_converted_limits(self, limit, convert)
   3697         """
   3698         if limit is not None:
-> 3699             converted_limit = convert(limit)
   3700             if isinstance(converted_limit, np.ndarray):
   3701                 converted_limit = converted_limit.squeeze()

/usr/local/lib/python3.11/dist-packages/matplotlib/axis.py in convert_units(self, x)
   1822             ret = self._converter.convert(x, self.units, self)
   1823         except Exception as e:
-> 1824             raise munits.ConversionError('Failed to convert value(s) to axis '
   1825                                          f'units: {x!r}') from e
   1826         return ret

ConversionError: Failed to convert value(s) to axis units: '2/8/2024 7:51'

# Dataset that also carries monetary_value
# Only people with at least one repeat purchase (returning customers only!)
df.head()

# Correlation between 'monetary_value' and 'frequency' among returning customers
df[['monetary_value', 'frequency']].corr()

# Once the correlation has been checked, fit the model
from lifetimes import GammaGammaFitter

ggf = GammaGammaFitter(penalizer_coef = 0.01)
ggf.fit(df['frequency'],               # *what is trained here is the returning-customer data only
        df['monetary_value'])
ggf

<lifetimes.GammaGammaFitter: fitted with 1000 subjects, p: 4.02, q: 0.38, v: 3.80>

# Use the method provided by ggf to predict, for each customer, the average
# amount expected per future purchase.
# NOTE(review): the original remark on the first argument said this call
# predicts for all customers (including those who never returned), but `df`
# is the same frame that is passed to ggf.fit - confirm the intended population.
result = ggf.conditional_expected_average_profit(
        df['frequency'],
        df['monetary_value']
    )         # predicted average purchase amount per customer ID, returned as a Series

result.head(10)

# Distribution of the Gamma-Gamma predictions, drawn through the Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(result, bins=40, kde=True, color='skyblue', ax=ax)
ax.set_title('Predicted Average Purchase Value Distribution (Gamma-Gamma)')
ax.set_xlabel('Predicted Average Value')
ax.set_ylabel('Number of Customers')
ax.grid(axis='y', linestyle='--', alpha=0.4)
fig.tight_layout()
plt.show()

import pandas as pd

# One frame per customer holding the segment label, the Gamma-Gamma expected
# average value and the 30-day purchase expectation (joined on the index).
temp = pd.DataFrame({
    'segment': data['segment'],
    'expected_avg_value': result,
    'predicted_purchases_30d': data['predicted_purchases_30d']
})

# Spread of the expected average purchase value within each segment.
plt.figure(figsize=(8, 5))
sns.boxplot(
    data=temp,
    x='segment',
    y='expected_avg_value',
    # seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
    # variable and hiding the legend keeps the per-segment colours.
    hue='segment',
    order=['VIP', 'Potential', 'Passive', 'Churn Risk'],
    palette=segment_palette,
    legend=False,
)

plt.title('Predicted Average Purchase Value by Segment')
plt.xlabel('Customer Segment')
plt.ylabel('Predicted Average Purchase Value')
plt.grid(axis='y', linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()

# 30-day purchase expectation against expected average value, coloured by
# segment; drawn through the Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=temp,
    x='predicted_purchases_30d',
    y='expected_avg_value',
    hue='segment',
    palette=segment_palette,
    ax=ax,
)

ax.set_title('Predicted Purchase Frequency vs. Average Order Value')
ax.set_xlabel('Expected Purchases in 30 Days')
ax.set_ylabel('Predicted Average Purchase Value')
ax.grid(True, linestyle='--', alpha=0.3)
fig.tight_layout()
plt.show()

# Mean predicted purchase amount across the customers in `df`.
# "Expected conditional average profit" is the mean of the model's predictions;
# "Average profit" is the mean monetary_value observed in the data for
# customers with frequency > 0.
expected_avg_profit = ggf.conditional_expected_average_profit(
    df['frequency'],
    # Taken from the same frame as `frequency` so both inputs describe the
    # same customers, row for row.
    df['monetary_value'],
).mean()
observed_avg_profit = df.loc[df['frequency'] > 0, 'monetary_value'].mean()

print("Expected conditional average profit: %s, Average profit: %s" % (
    expected_avg_profit,
    observed_avg_profit,
))

Expected conditional average profit: 192.68685396004423, Average profit: 254.40209095919482

	frequency	recency	monetary_value	T
CustomerID
121413	1	103	448.843328	131
818911	1	79	258.033574	104
418277	1	195	448.615587	349
628270	1	67	241.698151	332
147124	1	335	196.334731	342

	frequency	recency	T
CustomerID
121413	1	103	131
818911	1	79	104
418277	1	195	349
628270	1	67	332
147124	1	335	342

	coef	se(coef)	lower 95% bound	upper 95% bound
r	2.260015	0.125195	2.014633	2.505396
alpha	229.960700	16.886742	196.862685	263.058715
a	0.662256	0.149945	0.368363	0.956149
b	0.055920	0.015328	0.025877	0.085962

	frequency	recency	T	predicted_purchases
CustomerID
340516	3	204	208	0.062279
294753	2	74	83	0.054933
892820	3	195	242	0.049623
693531	2	136	143	0.047032
818275	2	136	148	0.045346

	frequency	recency	T	predicted_purchases
CustomerID
340516	3	204	208	0.259558
294753	2	74	83	0.225307
892820	3	195	242	0.207212
693531	2	136	143	0.194194
818275	2	136	148	0.187319

재구매 및 금액 예측 기반 고객 가치 분석 프로젝트¶

1. 프로젝트 개요¶

분석의 목표:¶

2. 데이터 준비¶

- 구글 빅쿼리 이용해서 필요한 데이터를 분석에 맞게 출력¶

3. BG/NBD 모델¶

1. 모델 파라미터 해석:¶

2. penalizer_coef(정규화 계수):¶

BG/NBD 모델로 예측된 미래 구매 횟수의 시각화 (각각의 frequency 축과 recency축에 따라)¶

BG/NBD 모델로 추정한 고객이 아직 활성(Active) 상태일 확률의 시각화¶

BG/NBD 모델을 이용해 고객들이 향후 일정 기간(t=1 단위) 동안 예상 구매 횟수가 높은 순으로 고객을 랭킹하는 방법¶

세그먼트 기준¶

결과:¶

BG/NBD 모델의 적합도(fit)를 시각적으로 평가하는 방법 (2가지를 비교함)¶

개별 고객 단위로 예측과 확률 변화를 구체적으로 확인/조회하는 방법¶

4. Gamma-Gamma 모델이란?¶

중요 포인트: Gamma-Gamma 모델로 CLV를 계산할 때는 'monetary value'와 'purchase frequency' 사이에 상관관계가 없다는 전제가 필요함.¶

결과:¶

결과:¶

결과:¶

	frequency	recency	T	predicted_purchases	predicted_purchases_7d	predicted_purchases_30d	predicted_purchases_90d
CustomerID
340516	3	204	208	0.728845	0.062279	0.259558	0.728845
294753	2	74	83	0.611746	0.054933	0.225307	0.611746
892820	3	195	242	0.584289	0.049623	0.207212	0.584289
693531	2	136	143	0.534418	0.047032	0.194194	0.534418
818275	2	136	148	0.515993	0.045346	0.187319	0.515993

세그먼트	기준	의미
VIP	> 0.10	다음 30일 내 재구매가 활발하게 예상되는 최상위 고객
Potential	0.05 ~ 0.10	적당한 재구매 가능성 있음
Passive	0.01 ~ 0.05	가능성은 낮지만 살아있는 고객
Churn Risk	≤ 0.01	이탈 가능성이 매우 높은 고객

	CustomerID	ProductID	Quantity	Price	TransactionDate	PaymentMethod	StoreLocation	ProductCategory	DiscountApplied(%)	TotalAmount
0	109318	C	7	80.079844	12/26/2023 12:32	Cash	176 Andrew Cliffs\nBaileyfort, HI 93354	Books	18.677100	455.862764
1	993229	C	4	75.195229	8/5/2023 0:00	Cash	11635 William Well Suite 809\nEast Kara, MT 19483	Home Decor	14.121365	258.306546
2	579675	A	8	31.528816	3/11/2024 18:51	Cash	910 Mendez Ville Suite 909\nPort Lauraland, MO...	Books	15.943701	212.015651
3	799826	D	5	98.880218	10/27/2023 22:00	PayPal	87522 Sharon Corners Suite 500\nLake Tammy, MO...	Books	6.686337	461.343769
4	121413	A	7	93.188512	12/22/2023 11:38	Cash	0070 Michelle Island Suite 143\nHoland, VA 80142	Electronics	4.030096	626.030484

	0
CustomerID
121413	534.960998
818911	309.449982
418277	534.691839
628270	290.143747
147124	236.530392
589251	161.260519
608105	433.630972
201514	228.668338
91439	277.374586
301389	510.046635

	count
segment
Churn Risk	675
Passive	291
VIP	24
Potential	10