## Environment : Anaconda-navigator
## Programming Language : Python 3
## import pandas as pd
## import seaborn as sns
## import numpy as np
## import matplotlib.pyplot as plt
## from datetime import timedelta
## from datetime import datetime, date
## from scipy.sparse import csr_matrix
## from math import sqrt
## from tqdm import tqdm_notebook as tqdm
## from sklearn.metrics.pairwise import cosine_similarity
## from sklearn.model_selection import train_test_split
## from sklearn.metrics import mean_squared_error
## import plotly.express as px
1. Load the data and preprocess it
2. Analyze the data
3. Build the recommendation system
We will build the recommendation system using a Factorization Machine (FM).
For an explanation of FM, please see https://datapractice0815.tistory.com/208.
First, due to resource constraints, we will work with a 1/100 sample of the data.
# sample_df = total.sample(frac=0.01, random_state=42)
# sample_df.to_csv('sample_df.csv')
# Sampled the total data down to 1/100 due to resource constraints
FILES_DIR = './files/'
sample_df = pd.read_csv(FILES_DIR + 'sample_df.csv')
Next, we preprocess the data based on what we found during the analysis.
# We treat this as a binary problem: purchase_success is 1, every other event is -1
sample_df['rating'] = sample_df['event_name']
re_name = {'rating' : {'click_item':-1, 'like_item':-1, 'add_to_cart':-1, 'purchase_success':1}}
sample_df = sample_df.replace(re_name)
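As a quick sanity check (this snippet is just illustrative, not part of the original pipeline), we can look at how the labels are distributed after the mapping:
# Label distribution: 1 = purchase_success, -1 = every other event
print(sample_df['rating'].value_counts())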
####################################
####################################
sample_df = sample_df[sample_df['gender'] != 'un_gender']
####################################
####################################
sample_df.loc[sample_df['age'] <= 17, 'age(Group)'] = '00 ~ 17'
sample_df.loc[(sample_df['age'] >= 18) &
              (sample_df['age'] <= 24), 'age(Group)'] = '18 ~ 24'
sample_df.loc[(sample_df['age'] >= 25) &
              (sample_df['age'] <= 35), 'age(Group)'] = '25 ~ 35'
sample_df.loc[(sample_df['age'] >= 36) &
              (sample_df['age'] <= 44), 'age(Group)'] = '36 ~ 44'
sample_df.loc[(sample_df['age'] >= 45) &
              (sample_df['age'] <= 54), 'age(Group)'] = '45 ~ 54'
sample_df.loc[sample_df['age'] >= 55, 'age(Group)'] = '55 ~ 99'
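The same buckets can also be built in one step with pd.cut; the sketch below is an optional, equivalent alternative (the column name age(Group)_cut is hypothetical, and it assumes age is numeric):
# Optional alternative: right-closed bins reproduce the .loc rules above
age_bins = [-np.inf, 17, 24, 35, 44, 54, np.inf]
age_labels = ['00 ~ 17', '18 ~ 24', '25 ~ 35', '36 ~ 44', '45 ~ 54', '55 ~ 99']
sample_df['age(Group)_cut'] = pd.cut(sample_df['age'], bins=age_bins, labels=age_labels)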
####################################
####################################
sample_df['event_timestamp'] = pd.to_datetime(sample_df['event_timestamp'])
sample_df['event_timestamp(weekday)'] = sample_df['event_timestamp'].dt.day_name()
First, we extract only the columns we need from the existing data.
# Extract only the required columns
col_list_1 = ['event_name', 'user_no', 'item_no', 'country', 'region', 'platform',
'event_timestamp_month', 'event_timestamp_day', 'event_timestamp_hour',
'category1_name', 'category2_name', 'gender', 'age', 'age(Group)', 'event_timestamp(weekday)']
raw_data_1 = sample_df[col_list_1]
Now we build the X and y data.
X_data = pd.get_dummies(raw_data_1, columns=col_list_1)
y_data = sample_df['rating']
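get_dummies one-hot encodes every column in col_list_1 (including user_no and item_no), so each row of X_data is the full FM feature vector for one event. A quick shape check, just for illustration:
# Rows = events, columns = one-hot features (user, item, context, ...)
print(X_data.shape, y_data.shape)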
We convert the X data into a csr_matrix.
For a detailed explanation of csr_matrix, see https://rfriend.tistory.com/551.
import scipy.sparse
X_sparse = scipy.sparse.csr_matrix(X_data.values)
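sgd() and predict() below read the matrix through X.data, X.indptr, and X.indices, so here is a tiny toy example (the values are made up purely for illustration) of how a csr_matrix stores its entries:
# Toy CSR example: how data / indices / indptr relate
toy = scipy.sparse.csr_matrix(np.array([[1, 0, 2],
                                        [0, 0, 3]]))
print(toy.data)     # non-zero values:            [1 2 3]
print(toy.indices)  # column index of each value: [0 2 2]
print(toy.indptr)   # row i lives in data[indptr[i]:indptr[i+1]]: [0 2 3]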
Model equation
Pairwise interaction term, computed with linear complexity
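For reference, the code below implements the standard FM model equation, with the pairwise interaction term rewritten so it can be computed in linear time (this is the formulation from Rendle's Factorization Machines paper, not something specific to this post):

$$\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j$$

$$\sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j = \frac{1}{2} \sum_{f=1}^{k} \left[ \left( \sum_{i=1}^{n} v_{i,f} x_i \right)^2 - \sum_{i=1}^{n} v_{i,f}^2 x_i^2 \right]$$

This reformulation drops the cost of the interaction term from O(kn²) to O(kn); the predict() function below computes it with the summed and summed_squared arrays.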
This part defines the loss. It is the negative log-likelihood used for binary classification.
# Compute negative log likelihood between prediction and label
def log_loss(pred, y):
    return np.log(np.exp(-pred * y) + 1.0)
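For labels y ∈ {-1, +1}, the per-sample loss above and its derivative with respect to the prediction are:

$$L(\hat{y}, y) = \log\left(1 + e^{-y\hat{y}}\right), \qquad \frac{\partial L}{\partial \hat{y}} = \frac{-y}{1 + e^{y\hat{y}}}$$

The second expression is exactly the loss_gradient computed inside sgd() below.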
Next, we update the gradients using stochastic gradient descent (SGD).
# Update gradients with one pass of SGD over all samples
def sgd(X, y, n_samples, n_features,
        w0, w, v, n_factors, learning_rate, reg_w, reg_v):
    data = X.data
    indptr = X.indptr
    indices = X.indices
    loss = 0.0

    for i in range(n_samples):
        pred, summed = predict(X, w0, w, v, n_factors, i)

        # calculate loss and its gradient
        loss += log_loss(pred, y[i])
        loss_gradient = -y[i] / (np.exp(y[i] * pred) + 1.0)

        # update bias/intercept term
        w0 -= learning_rate * loss_gradient

        # update weight
        for index in range(indptr[i], indptr[i + 1]):
            feature = indices[index]
            w[feature] -= learning_rate * (loss_gradient * data[index] + 2 * reg_w * w[feature])

        # update factor
        for factor in range(n_factors):
            for index in range(indptr[i], indptr[i + 1]):
                feature = indices[index]
                term = summed[factor] - v[factor, feature] * data[index]
                v_gradient = loss_gradient * data[index] * term
                v[factor, feature] -= learning_rate * (v_gradient + 2 * reg_v * v[factor, feature])

    loss /= n_samples
    # w and v are updated in place, but w0 is a plain float, so return it to the caller as well
    return loss, w0
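For reference, the updates above follow from the chain rule: each parameter gradient is ∂L/∂ŷ times the corresponding FM model derivative, plus the L2 terms 2·reg_w·w and 2·reg_v·v seen in the code:

$$\frac{\partial \hat{y}}{\partial w_0} = 1, \qquad \frac{\partial \hat{y}}{\partial w_i} = x_i, \qquad \frac{\partial \hat{y}}{\partial v_{i,f}} = x_i \sum_{j=1}^{n} v_{j,f} x_j - v_{i,f} x_i^2$$

The sum over j is the summed[factor] value returned by predict(), which is why it can be re-used in the factor update.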
This is the part that computes the predicted value of y.
def predict(X, w0, w, v, n_factors, i):
    """Predict a single instance i of the sparse matrix X."""
    data = X.data
    indptr = X.indptr
    indices = X.indices
    summed = np.zeros(n_factors)
    summed_squared = np.zeros(n_factors)

    # linear output w * x
    pred = w0
    for index in range(indptr[i], indptr[i + 1]):
        feature = indices[index]
        pred += w[feature] * data[index]

    # factor output
    for factor in range(n_factors):
        for index in range(indptr[i], indptr[i + 1]):
            feature = indices[index]
            term = v[factor, feature] * data[index]
            summed[factor] += term
            summed_squared[factor] += term * term
        pred += 0.5 * (summed[factor] * summed[factor] - summed_squared[factor])

    # summed does not depend on the gradient update, so it can be re-used there
    return pred, summed
This is the training loop.
# Train the Factorization Machine
# X -> sparse csr_matrix, y -> label
def fit(X, y, config):
    epochs = config['num_epochs']
    num_factors = config['num_factors']
    learning_rate = config['learning_rate']
    reg_weights = config['reg_weights']
    reg_features = config['reg_features']

    num_samples, num_features = X.shape
    weights = np.zeros(num_features)  # -> w
    global_bias = 0.0                 # -> w0
    # latent factors for all features -> v
    feature_factors = np.random.normal(size=(num_factors, num_features))

    epoch_loss = []
    for epoch in range(epochs):
        loss, global_bias = sgd(X, y, num_samples, num_features,
                                global_bias, weights,
                                feature_factors, num_factors,
                                learning_rate, reg_weights, reg_features)
        print(f'[epoch: {epoch+1}], loss: {loss}')
        epoch_loss.append(loss)

    return epoch_loss
config = {
    "num_epochs": 10,
    "num_factors": 10,
    "learning_rate": 0.01,
    "reg_weights": 0.01,
    "reg_features": 0.01
}
epoch_loss = fit(X_sparse, y_data.values, config)
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=epoch_loss, mode='lines+markers'))
fig.show()
Starting from this base data, we will gradually add more features and see how the FM's performance changes.