2016年5月23日月曜日

【R】 任意の月次新規集客数を指定して将来のユーザ動態をシミュレートする

下記、Rのコード(備忘録用)

#内容:任意の月次新規集客数から将来のユーザ動態をシミュレートする
#必要に応じてパッケージをインストール 下記一例
#install.packages("rpart")
#install.packages("rpart.plot")
#install.packages("partykit")
#install.packages("rattle")
#install.packages("ggplot2")
#install.packages("sna")
library(data.table) #データ読み込み fread
library(dplyr) #SQLのような集計処理 group_by, summarise
library(rpart) #決定木
library(rattle)      # 描画用ライブラリ
library(rpart.plot)  # 描画用ライブラリ
library(partykit)
#ネットワーク作図用
library(nnet)
library(MASS)
library(reshape2)
library(ggplot2)
library(sna)
#月次課金額を計算する際の基準とする
set.now_firstday <- as.Date("2016-02-01")
set.now_endofmonth <- as.Date("2016-03-01")
newusers <- 50000 #新規ユーザ数を任意に設定
ActZrate <- 0.5 #前月act(A~NP)のZの割合(前月ログインユーザの翌月ログイン無率)
#3か月分のdauをそれぞれ読み込む
df.dau_before <- fread("./dau_count_2016-01-01_to_2016-01-31_Total.csv")
df.dau_before <- data.frame(df.dau_before)
df.dau_before <- df.dau_before %>%
  group_by(user_id) %>%
  summarise(start_time = min(start_time))
df.dau_now <- fread("./dau_count_2016-02-01_to_2016-02-29_Total.csv")
df.dau_now <- data.frame(df.dau_now)
df.dau_now <- df.dau_now %>%
  group_by(user_id) %>%
  summarise(start_time = min(start_time))
df.dau_next <- fread("./dau_count_2016-03-01_to_2016-03-31_Total.csv")
df.dau_next <- data.frame(df.dau_next)
df.dau_next <- df.dau_next %>%
  group_by(user_id) %>%
  summarise(start_time = min(start_time))
#2か月分の課金額を読み込む
df.purchase_all <- fread("./purchase_20160201_20160331.csv")
df.purchase_all <- data.frame(df.purchase_all)
#null
df.purchase_all[is.na(df.purchase_all)] <- 0
#登録日から現在までの利用期間を計算
set.basetime <- set.now_endofmonth
df.dau_before$playtime <- set.basetime - as.Date(df.dau_before$start_time)
df.dau_now$playtime <- set.basetime - as.Date(df.dau_now$start_time)
#月次課金額をユーザ別に集計
df.purchase_now <- df.purchase_all %>%
  filter(paid_at < set.now_endofmonth) %>%
  group_by(user_id) %>%
  summarize(total = sum(price))
df.purchase_next <- df.purchase_all %>%
  filter(paid_at >= set.now_endofmonth) %>%
  group_by(user_id) %>%
  summarize(total = sum(price))
#課金額に応じてクラス分けする
df.purchase_now$class <- "G"
df.purchase_now[df.purchase_now$total <= 1000,]$class <- "F"
df.purchase_now[df.purchase_now$total > 1000 & df.purchase_now$total <= 3000,]$class <- "E"
df.purchase_now[df.purchase_now$total > 3000 & df.purchase_now$total <= 10000,]$class <- "D"
df.purchase_now[df.purchase_now$total > 10000 & df.purchase_now$total <= 30000,]$class <- "C"
df.purchase_now[df.purchase_now$total > 30000 & df.purchase_now$total <= 100000,]$class <- "B"
df.purchase_now[df.purchase_now$total > 100000,]$class <- "A"
df.purchase_next$class <- "G"
df.purchase_next[df.purchase_next$total <= 1000,]$class <- "F"
df.purchase_next[df.purchase_next$total > 1000 & df.purchase_next$total <= 3000,]$class <- "E"
df.purchase_next[df.purchase_next$total > 3000 & df.purchase_next$total <= 10000,]$class <- "D"
df.purchase_next[df.purchase_next$total > 10000 & df.purchase_next$total <= 30000,]$class <- "C"
df.purchase_next[df.purchase_next$total > 30000 & df.purchase_next$total <= 100000,]$class <- "B"
df.purchase_next[df.purchase_next$total > 100000,]$class <- "A"
#新規登録ユーザIDを特定
df.newuserid <- df.dau_now %>%
  filter(start_time >= set.now_firstday)
#必要なカラムのみ抽出
#dau
df.dau_before <- df.dau_before %>%
  dplyr::select(user_id, playtime)
df.dau_now <- df.dau_now %>%
  dplyr::select(user_id, playtime)
df.dau_next <- df.dau_next %>%
  dplyr::select(user_id)
df.purchase_now <- df.purchase_now %>%
  dplyr::select(user_id,class)
df.purchase_next <- df.purchase_next %>%
  dplyr::select(user_id,class)
#既存ユーザ属性特定
#join to dau
df.all_now <- left_join(df.dau_now,df.purchase_now,by = "user_id")
df.all_now[is.na(df.all_now)] <- "NP"
#離脱ユーザの特定と新規除外
#z:before > now
df.all_now_old <- left_join(df.dau_before,df.all_now, by = "user_id")
df.all_now_old[is.na(df.all_now_old)] <- "Z"
#np:now > next
df.all_next <- left_join(df.dau_next,df.purchase_next,by = "user_id")
df.all_next[is.na(df.all_next)] <- "NP"
#z:now > next
df.all_now_old <- left_join(df.all_now_old, df.all_next, by = "user_id")
df.all_now_old[is.na(df.all_now_old)] <- "Z"
#rename and select
df.all_old <- df.all_now_old %>%
  dplyr::select(user_id, playtime.x, class.x, class.y)
names(df.all_old) <- c("user_id","playtime","class_now","class_next")
#新規ユーザ属性特定
df.dau_now_new <- left_join(df.newuserid, df.dau_now, by = "user_id")
df.all_now_new <- left_join(df.dau_now_new, df.purchase_now, by = "user_id")
df.all_now_new[is.na(df.all_now_new)] <- "NP"
#z:now > next
df.all_now_new <- left_join(df.all_now_new, df.all_next, by = "user_id")
df.all_now_new[is.na(df.all_now_new)] <- "Z"
#rename and select
df.all_new <- df.all_now_new %>%
  dplyr::select(user_id, playtime.x, class.x, class.y)
names(df.all_new) <- c("user_id","playtime","class_now","class_next")
#新規をrbind
df.all <- rbind(df.all_new, df.all_old)
#決定木モデル構築
#playtimは変数として使うとシミュレーションが困難となるため使用しない
tree <- rpart(class_next ~ class_now,data = df.all,control=rpart.control(minsplit=1, cp=0.0001))
dev.off() #エラー回避→ Error in .Call.graphics(C_palette2, .Call(C_palette2, NULL)) :invalid graphics state
plot(as.party(tree))
plotcp(tree)
#遷移確率
class_now <- df.all$class_now
df.test <- data.frame(class_now)
df.pred <- predict(tree,df.test)
df.pred <- data.frame(df.pred)
df.pred <- cbind(df.test,df.pred) #横に連結
#write.csv(df.pred,"./output/pred.csv")
#確率
df.pred_Pv <- melt(df.pred, id.vars = c("class_now"),variable.name = "class_next", na.rm = TRUE)
df.pred_Pv <- df.pred_Pv %>%
  group_by(class_now,class_next) %>%
  summarise(Pv = mean(value))
write.csv(df.pred_Pv,"./output/pred_Pv.csv")
#初期化 ↓
#initial data
df.initialdata <- df.all_now
df.initialdata <- df.initialdata %>%
  group_by(playtime, class) %>%
  summarise(uu = length(user_id))
names(df.initialdata) <- c("playtime","class_now","uu")
#sum = a~npの合計
anp_before <- sum(df.initialdata$uu)
#newuser
df.newuser_PayPv <- left_join(df.dau_now,df.purchase_now,by = "user_id")
df.newuser_PayPv <- df.newuser_PayPv %>%
  filter(playtime <= 30) #1ヶ月は30日とする
df.newuser_PayPv[is.na(df.newuser_PayPv)] <- "NP"
#newuser Pv
df.newuser_PayPv <- df.newuser_PayPv %>%
  group_by(class) %>%
  summarize(uu = length(user_id))
df.newuser_PayPv$Pv <- df.newuser_PayPv$uu / sum(df.newuser_PayPv$uu)
#設定
df.newuser_PayPv$uu <- df.newuser_PayPv$Pv * newusers
df.newuser_PayPv$playtime <- 1
df.newuser_PayPv <- df.newuser_PayPv %>%
  group_by(playtime, class) %>%
  summarise(uu = sum(uu))
names(df.newuser_PayPv) <- c("playtime","class_now","uu")
df.result <- df.initialdata
df.result$turn <- 0
#simulation 最初の予測
#変数データを用意 playtime class_now
df.sim <- left_join(df.initialdata, df.pred_Pv, by = "class_now")
df.sim$uu_next <- df.sim$uu * df.sim$Pv
df.sim <- df.sim %>%
  group_by(playtime, class_next) %>%
  summarise(uu = sum(uu_next))
names(df.sim) <- c("playtime","class_now","uu")
df.sim$playtime <- df.sim$playtime + 30 #1ヶ月は30日とする
#次の新規挿入
df.sim <- rbind(df.newuser_PayPv,df.sim)
#前月actZrate
df.sim[df.sim$class_now == "Z",]$uu <- df.sim[df.sim$class_now == "Z",]$uu * ActZrate
#初期値に代入
df.initialdata <- df.sim
df.result_test <- df.initialdata
df.result_test$turn <- 1
df.result <- rbind(df.result, df.result_test)
#simulation 2回目以降の予測
#初期化
#予測
for(i in 2:30){ #30ヵ月後までシミュレート(任意)
  #変数データを用意 playtime class_now
  df.sim <- left_join(df.initialdata, df.pred_Pv, by = "class_now")
  df.sim$uu_next <- df.sim$uu * df.sim$Pv
  df.sim <- df.sim %>%
    group_by(playtime, class_next) %>%
    summarise(uu = sum(uu_next))
  names(df.sim) <- c("playtime","class_now","uu")
  df.sim$playtime <- df.sim$playtime + 30 #1ヶ月は30日とする
  #前月actZrate
  df.sim[df.sim$class_now == "Z",]$uu <- df.sim[df.sim$class_now == "Z",]$uu * ActZrate
  #次の新規挿入
  df.sim <- rbind(df.newuser_PayPv,df.sim)
  #初期値に代入
  df.initialdata <- df.sim
  df.result_test <- df.initialdata
  df.result_test$turn <- i
  df.result <- rbind(df.result, df.result_test)
}
#結果出力
write.csv(df.result,"./output/result.csv")
#遷移確率のネットワーク図
df.Pv_netwaork <- acast(df.pred_Pv, class_now ~ class_next, max, value.var = "Pv")
gplot(df.Pv_netwaork,mode = "circle",edge.lwd = df.Pv_netwaork*15, edge.col = "#c8c8cb", displaylabels = TRUE)


2 件のコメント :

  1. 補足:ネットワーク図のとこは単に遷移確率を見える化してるだけなので、それが無くてもシミュレーションは可能

    返信削除
  2. 補足:「ActZrate <- 0.5 #前月act(A~NP)のZの割合(前月ログインユーザの翌月ログイン無率)」は事前に調べて設定する

    返信削除