
E-commerce Data Analysis

3-1

## Environment : Anaconda-navigator
## Programming Language : Python 3

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from datetime import timedelta
from datetime import datetime, date
from scipy.sparse import csr_matrix
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import plotly.express as px

 

 

1. Loading and preprocessing the data
2. Data analysis
3. Recommendation system

 

 

We will build the recommendation system with a Factorization Machine (FM).

For an explanation of FM, see https://datapractice0815.tistory.com/208.

 

 

Because of resource constraints, we will first sample the data down to 1/100 and work with that.

# Due to resource constraints, sample the total data down to 1/100
# sample_df = total.sample(frac=0.01, random_state=42)
# sample_df.to_csv('sample_df.csv')

FILES_DIR = './files/'
sample_df = pd.read_csv(FILES_DIR + 'sample_df.csv')

 

Next, we apply the preprocessing worked out in the earlier analysis posts.

# We frame this as a binary problem, so every event except a purchase becomes -1
sample_df['rating'] = sample_df['event_name']
re_name = {'rating' : {'click_item':-1, 'like_item':-1, 'add_to_cart':-1, 'purchase_success':1}}
sample_df = sample_df.replace(re_name)
####################################
####################################
sample_df = sample_df[sample_df['gender'] != 'un_gender']
####################################
####################################
sample_df.loc[sample_df['age'] <= 17, 'age(Group)'] = '00 ~ 17'

sample_df.loc[(sample_df['age'] >= 18) &\
               (sample_df['age'] <= 24), 'age(Group)'] = '18 ~ 24'

sample_df.loc[(sample_df['age'] >= 25) &\
               (sample_df['age'] <= 35), 'age(Group)'] = '25 ~ 35'

sample_df.loc[(sample_df['age'] >= 36) &\
               (sample_df['age'] <= 44), 'age(Group)'] = '36 ~ 44'

sample_df.loc[(sample_df['age'] >= 45) &\
               (sample_df['age'] <= 54), 'age(Group)'] = '45 ~ 54'

sample_df.loc[sample_df['age'] >= 55, 'age(Group)'] = '55 ~ 99'
####################################
####################################
sample_df['event_timestamp'] = pd.to_datetime(sample_df['event_timestamp'])
sample_df['event_timestamp(weekday)'] = sample_df['event_timestamp'].dt.day_name()
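
Before building features, it is worth a quick look at how the binary rating and the age groups turned out. A small, optional check (the actual counts depend on the sample):

# Optional sanity check -- counts depend on the sampled data
print(sample_df['rating'].value_counts())
print(sample_df['age(Group)'].value_counts().sort_index())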

 

First, we extract only the columns we need from the original data.

# Extract only the needed columns
col_list_1 = ['event_name', 'user_no', 'item_no', 'country', 'region', 'platform',
              'event_timestamp_month', 'event_timestamp_day', 'event_timestamp_hour',
              'category1_name', 'category2_name', 'gender', 'age', 'age(Group)',
              'event_timestamp(weekday)']

raw_data_1 = sample_df[col_list_1]

 

Now we build the X and y data: one-hot encode the selected columns for X and use the binary rating as y.

X_data = pd.get_dummies(raw_data_1, columns=col_list_1)
y_data = sample_df['rating']
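
For reference, pd.get_dummies turns every level of every column in col_list_1 into its own indicator column, which is exactly the one-hot input an FM expects. A tiny made-up example:

# Hypothetical mini-frame just to show what get_dummies produces
demo = pd.DataFrame({'platform': ['web', 'app', 'web'],
                     'gender': ['m', 'f', 'f']})
print(pd.get_dummies(demo, columns=['platform', 'gender']))
# -> one indicator column per level: platform_app, platform_web, gender_f, gender_m
#    (values print as 0/1 or True/False depending on the pandas version)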

 

Next, convert the X data to a csr_matrix.

For a detailed explanation of csr_matrix, see https://rfriend.tistory.com/551.

X_sparse_1 = csr_matrix(X_data.values)
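
The hand-written SGD below walks over this matrix through its data, indices, and indptr arrays, so it helps to know what those hold. A small made-up example:

# Hypothetical 3x4 matrix, mostly zeros
dense = np.array([[1, 0, 0, 1],
                  [0, 0, 1, 0],
                  [1, 1, 0, 0]])
m = csr_matrix(dense)

print(m.data)     # [1 1 1 1 1] -> the non-zero values, row by row
print(m.indices)  # [0 3 2 0 1] -> the column index of each non-zero value
print(m.indptr)   # [0 2 3 5]   -> row i's non-zeros sit in data[indptr[i]:indptr[i+1]]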

 

 

Model Equation:

$$\hat{y}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle\, x_i x_j$$

The pairwise interaction term can be computed with linear complexity:

$$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle\, x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n} v_{i,f}\, x_i\right)^2 - \sum_{i=1}^{n} v_{i,f}^2\, x_i^2\right]$$
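
As a quick sanity check with made-up numbers, the linear-time form really does equal the naive pairwise sum:

# Small numerical check of the linear-time identity (hypothetical sizes)
rng = np.random.default_rng(0)
n, k = 6, 3                          # n features, k latent factors
x = rng.normal(size=n)
v = rng.normal(size=(k, n))          # same (factor, feature) layout as the code below

# naive O(k * n^2) double sum over feature pairs
naive = sum(v[:, i] @ v[:, j] * x[i] * x[j]
            for i in range(n) for j in range(i + 1, n))

# linear-time form used in predict(): 0.5 * sum_f [ (sum_i v_fi x_i)^2 - sum_i v_fi^2 x_i^2 ]
summed = v @ x
summed_squared = (v ** 2) @ (x ** 2)
fast = 0.5 * np.sum(summed ** 2 - summed_squared)

print(np.isclose(naive, fast))       # True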

 

This part defines the loss.

It is the negative log likelihood used for binary classification, with labels y in {-1, +1}.

# Compute negative log likelihood between prediction and label
def log_loss(pred, y):
    return np.log(np.exp(-pred * y) + 1.0)
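
A couple of hand-picked scores show how this behaves: because the labels are -1/+1, the loss is small when the score agrees in sign with the label and grows when it does not.

print(log_loss(pred=3.0, y=1))    # ~0.049 : confident and correct
print(log_loss(pred=-3.0, y=1))   # ~3.049 : confident and wrong
print(log_loss(pred=0.0, y=1))    # ~0.693 : log(2), completely undecided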

 

Next we update the gradients.

We will use stochastic gradient descent (SGD).

# Update gradients with one pass of SGD over all samples
def sgd(X, y, n_samples, n_features,
        w0, w, v, n_factors, learning_rate, reg_w, reg_v):
    data = X.data
    indptr = X.indptr
    indices = X.indices
    loss = 0.0

    for i in range(n_samples):
        pred, summed = predict(X, w0, w, v, n_factors, i)
        
        # calculate loss and its gradient
        loss += log_loss(pred, y[i])
        loss_gradient = -y[i] / (np.exp(y[i] * pred) + 1.0)
    
        # update bias/intercept term
        w0 -= learning_rate * loss_gradient

        # update weight
        for index in range(indptr[i], indptr[i + 1]):
            feature = indices[index]
            w[feature] -= learning_rate * (loss_gradient * data[index] + 2 * reg_w * w[feature])

        # update factor
        for factor in range(n_factors):
            for index in range(indptr[i], indptr[i + 1]):
                feature = indices[index]
                term = summed[factor] - v[factor, feature] * data[index]
                v_gradient = loss_gradient * data[index] * term
                v[factor, feature] -= learning_rate * (v_gradient + 2 * reg_v * v[factor, feature])
    
    loss /= n_samples
    # w and v are numpy arrays updated in place; w0 is a plain float, so return it as well
    return loss, w0

 

This function computes the predicted y value for a single instance.

 

def predict(X, w0, w, v, n_factors, i):
    """Predict the FM output for a single instance i."""
    data = X.data
    indptr = X.indptr
    indices = X.indices
    summed = np.zeros(n_factors)
    summed_squared = np.zeros(n_factors)

    # linear output w * x
    pred = w0
    for index in range(indptr[i], indptr[i + 1]):
        feature = indices[index]
        pred += w[feature] * data[index]

    # factor output
    for factor in range(n_factors):
        for index in range(indptr[i], indptr[i + 1]):
            feature = indices[index]
            term = v[factor, feature] * data[index]
            summed[factor] += term
            summed_squared[factor] += term * term

        pred += 0.5 * (summed[factor] * summed[factor] - summed_squared[factor])

    # summed does not depend on the term being updated, so it can be re-used in the gradient update
    return pred, summed

 

This is the training step.

# Train Factorization Machine
# X -> sparse csr_matrix, y -> label
def fit(X, y, config):
    epochs = config['num_epochs']
    num_factors = config['num_factors']
    learning_rate = config['learning_rate']
    reg_weights = config['reg_weights']
    reg_features = config['reg_features']

    num_samples, num_features = X.shape
    weights = np.zeros(num_features) # -> w
    global_bias = 0.0 # -> w0
    
    # latent factors for all features -> v
    feature_factors = np.random.normal(size = (num_factors, num_features))

    epoch_loss = []
    for epoch in range(epochs):
        loss, global_bias = sgd(X, y, num_samples, num_features,
                                global_bias, weights,
                                feature_factors, num_factors,
                                learning_rate, reg_weights, reg_features)
        print(f'[epoch: {epoch+1}], loss: {loss}')

        epoch_loss.append(loss)
      
    return epoch_loss

config = {
    "num_epochs": 10,
    "num_factors": 10,
    "learning_rate": 0.01,
    "reg_weights": 0.01,
    "reg_features": 0.01
}
epoch_loss = fit(X_sparse_1, y_data.values, config)

import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, len(epoch_loss) + 1)),
                         y=epoch_loss, mode='lines+markers'))
fig.show()
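
fit() above only returns the loss curve. Below is a minimal sketch of how the trained model could be used for scoring, assuming fit() is modified to also return global_bias, weights, and feature_factors (e.g. return epoch_loss, global_bias, weights, feature_factors): the FM score is pushed through a sigmoid to get a purchase probability.

# Minimal sketch -- assumes fit() is changed to also return the learned parameters
def predict_proba(X, w0, w, v, n_factors):
    """Sigmoid of the FM score for every row of a csr_matrix X."""
    probs = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
        score, _ = predict(X, w0, w, v, n_factors, i)
        probs[i] = 1.0 / (1.0 + np.exp(-score))   # P(purchase_success)
    return probs

# e.g. (hypothetical usage with the modified fit):
# epoch_loss, w0, w, v = fit(X_sparse_1, y_data.values, config)
# probs = predict_proba(X_sparse_1, w0, w, v, config['num_factors'])
# preds = np.where(probs >= 0.5, 1, -1)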

 

Starting from this basic feature set, we will gradually add more features and check how the FM's performance changes.

