total.Rmd

---
title: "Daja Vu, 2021, winter project"
output: github_document
---

```{r setup, include=FALSE}
# Global chunk defaults: echo the code, hide warnings in the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE
)
```

0. 사용한 패키지 모음

```{r message = F}
library(tidyverse)
library(gridExtra)
library(ggplot2)
library(lubridate)
library(scales)
library(xlsx)
```

---

## Train data를 통해 확인하자.

1. 전체 데이터 확인 작업

```{r}
# Load the raw training data (comma separated, UTF-8, strings kept as text).
# file.path() builds the path with a portable separator; the previous
# hard-coded "\\" only worked on Windows.
df <- read.table(
  file.path("origin_data", "funda_train.csv"),
  header = TRUE,
  sep = ",",
  fileEncoding = "UTF-8",
  stringsAsFactors = FALSE
)
```

```{r}
# Peek at both ends of the raw data and check its dimensions.
df %>% head()
df %>% tail()
df %>% dim()

# Rows whose amount exceeds 400,000.
df[df[["amount"]] > 400000, ]

# Total number of missing cells across all columns.
sum(is.na(df))
```

type_of_business에 NULL이 있음<br />
store_id, card_id는 범주형 수니까 패스

```{r}
# Summary statistics for columns 3 through 9.
df[, 3:9] %>% summary()
```

```{r}
# Distinct business types and a summary of the transaction-date column.
# Columns are referenced through df explicitly; attach() is avoided because it
# puts a copy of the columns on the search path and was never detached.
unique(df$type_of_business)
summary(df$transacted_date)

# transacted_date: 2016-06-01 ~ 2019-02-28
dates <- sort(unique(df$transacted_date))
head(dates)
tail(dates)
```

---

2. 맡은 파트별로 카테고리 붙이기

```{r}
# Business types (type_of_business values) assigned to this part of the project.
ff <- c(
  "가정용 세탁업",
  "간판 및 광고물 제조업",
  "결혼 상담 및 준비 서비스업",
  "경영 컨설팅업",
  "그 외 기타 분류 안된 사업지원 서비스업",
  "기록매체 복제업",
  "기타 건물 관련설비 설치 공사업",
  "기타 엔지니어링 서비스업",
  "기타 일반 및 생활 숙박시설 운영업",
  "스포츠 및 레크리에이션 용품 임대업",
  "애완동물 장묘 및 보호 서비스업",
  "애완용 동물 및 관련용품 소매업",
  "여관업",
  "여행사업",
  "예식장업",
  "인물사진 및 행사용 영상 촬영업",
  "자동차 세차업",
  "자동차 전문 수리업",
  "자동차 종합 수리업",
  "체형 등 기타 신체관리 서비스업",
  "통신장비 수리업",
  "택배업"
)
# length(ff)
```

```{r}
# Keep only the business types listed in ff and tag every row with its custom
# category: each listed type is "서비스업" except "택배업", which is "유통업".
# One vectorised filter replaces the per-type loop, which re-scanned df and
# re-copied the growing result on every iteration.
rectified <- df %>%
  filter(type_of_business %in% ff) %>%
  # Same row order as the loop produced: grouped by the type's position in ff.
  arrange(match(type_of_business, ff)) %>%
  mutate(cate = if_else(type_of_business == "택배업", "유통업", "서비스업"))
rectified %>% head()
```

```{r}
# Row/column counts overall and for each custom category.
dim(rectified)
dim(filter(rectified, cate == "서비스업"))
dim(filter(rectified, cate == "유통업"))
```

```{r}
# Export the tagged subset to the working directory.
rectified %>% write.csv("업종_서비스유통.csv")
# Alternative per-category file names used previously:
# write.csv(rectified, "업종_서비스.csv")
# write.csv(rectified, "업종_유통.csv")
```

---

3. 변수로 취급할 것들

- 날짜(transacted_date일, 월), 시간(transacted_time), 분류(cate), 판매량(amount)

```{r eval = F}
# Exploratory attempts kept for reference; the chunk header sets eval = F, so
# none of this runs when the document is knit.
# NOTE(review): type_of_business is read as character (stringsAsFactors = F in
# the loading chunk), so this plot() call likely needs a factor x — confirm
# before enabling.
plot(df$type_of_business, df$amount)
# NOTE(review): corrplot is not among the packages attached at the top of this
# file and would have to be loaded first — confirm the intended usage.
corrplot(df$type_of_business, df$amount)
```

3-1. 판매량을 변수로 이용해볼 순 없는 걸까.

```{r}
# Daily total amount per custom category, drawn as a scatter plot over time.
rectified %>%
  # filter(between(transacted_date, unique(rectified$transacted_date)[1], unique(rectified$transacted_date)[185])) %>%
  group_by(cate, transacted_date) %>%
  summarise(sumV = sum(amount), .groups = "drop") %>%
  # summarise(minV = min(amount), medianV = median(amount), meanV = mean(amount), maxV = max(amount)) %>%
  # transacted_date is read as text; convert it so the x axis is a real date
  # scale instead of one discrete tick per distinct string.
  mutate(transacted_date = as.Date(transacted_date)) %>%
  ggplot(aes(x = transacted_date, y = sumV, color = cate)) +
  geom_point()
```

뭔가 이상한데, 원 데이터는 어떻게 보일까.

```{r}
# Distribution of amount for every business type in the raw data.
ggplot(df, aes(x = type_of_business, y = amount)) +
  geom_boxplot()
```

- 결론: 판매량이 보통 이용하는 돈의 단위가 아니라서 안 될 것 같음.

<br />

3-2. 그렇다면 카드를 긁은 시간은 어떨까.

- 우리가 만든 카테고리로 '전체'를 먼저 만들자.

```{r}
# Custom categories, in the order in which their CSV files must be selected.
categories <- c("교육", "미용", "생활용품 소매업", "서비스업", "오락 및 여가", "유통업", "음식소매업", "의료", "의류", "일반 음식점", "전자기기", "휴게 음식점")

# Read one interactively chosen category file and normalise its columns so
# that it ends with a `cate` column.
read_category <- function(category) {
  # Rows are labelled by pick order, so say which file is expected now.
  message("Select the CSV file for category: ", category)
  part <- read.csv(file.choose(), header = TRUE)
  # Drop the first column when it is not store_id (e.g. a saved row-name column).
  if (colnames(part)[1] != "store_id") {
    part <- part[, -1, drop = FALSE]
  }
  # Files with fewer than 10 columns get a cate column set to this category.
  if (ncol(part) < 10) {
    part["cate"] <- category
  }
  # Otherwise, a last column under another name is renamed to cate.
  if (tail(colnames(part), 1) != "cate") {
    colnames(part)[ncol(part)] <- "cate"
  }
  part
}

# Read every category once and stack the results in a single bind_rows() call
# instead of growing the data frame inside a loop.
total <- bind_rows(lapply(categories, read_category))
total %>% dim()
total %>% head()
write.csv(total, "total.csv")
```

```{r eval = F}
# Re-load a previously saved total file and list the earliest transaction dates.
total <- read.csv(file.choose(), header = TRUE)
# pull() extracts the column as a plain vector; select() returns a one-column
# data frame, which sort() is not meant to handle.
total %>%
  pull(transacted_date) %>%
  unique() %>%
  sort() %>%
  head()
```

3-2-1. 전체에서 시각적으로 확인해보자.

```{r}
# Number of transactions per category for each time of day (HH:MM).
total %>%
  group_by(cate, transacted_time) %>%
  summarise(times = n()) %>%
  # Parse the HH:MM text so the x axis becomes a continuous time scale.
  mutate(part = as.POSIXct(transacted_time, format = "%H:%M")) %>%
  ggplot(aes(x = part, y = times, color = cate)) +
  geom_point() +
  scale_x_datetime(date_labels = "%H:%M") +
  # geom_vline(xintercept = as.numeric(dt_val$dt[3]), color = "red", linetype = 2)
  labs(x = NULL, y = NULL, color = NULL)
```

- 역시 코딩으로 저장이 최고다.

```{r eval = F}
# Save the time-of-day scatter plot (transactions per category and HH:MM) to
# total_time.jpg.
name <- "total_time.jpg"

# Build the plot before opening the device, so an error in the data pipeline
# cannot leave a graphics device open.
plotting <- total %>%
  group_by(cate, transacted_time) %>%
  summarise(times = n(), .groups = "drop") %>%
  mutate(part = as.POSIXct(transacted_time, format = "%H:%M")) %>%
  ggplot(aes(color = cate)) +
  geom_point(aes(x = part, y = times)) +
  scale_x_datetime(date_labels = "%H:%M") +
  # geom_vline(xintercept = as.numeric(dt_val$dt[3]), color = "red", linetype = 2)
  labs(x = NULL, y = NULL, color = NULL)

jpeg(name, width = 1200, height = 600)
# finally = dev.off() closes the file even when drawing fails.
invisible(tryCatch(print(plotting), finally = dev.off()))
```

결론: 역시 판매량은 쓸 수 없을 것 같다.

<br />

3-2-2. category별로 할 순 없는 걸까.

```{r include = F}
# Same time-of-day counts as above, one facet row per category.
total %>%
  group_by(cate, transacted_time) %>%
  summarise(times = n(), .groups = "drop") %>%
  mutate(part = as.POSIXct(transacted_time, format = "%H:%M")) %>%
  ggplot(aes(color = cate)) +
  facet_grid(cate ~ .) +
  geom_point(aes(x = part, y = times)) +
  scale_x_datetime(date_labels = "%H:%M") +
  # The facets already name the category, so hide the colour legend.
  # "none" is the supported spelling; passing FALSE/F is deprecated.
  guides(color = "none")
```

3-3. custom 카테고리 X 요일의 조합은 어떨까.

```{r}
# Number of transactions per category and day of the week.
total %>%
  # Fix the factor to all seven days (1 = Sunday) and attach the labels to the
  # levels themselves, so they stay aligned even if a weekday has no rows.
  mutate(part = factor(
    wday(transacted_date, week_start = 7),
    levels = 1:7,
    labels = c("일", "월", "화", "수", "목", "금", "토")
  )) %>%
  group_by(cate, part) %>%
  summarise(times = n(), .groups = "drop") %>%
  ggplot(aes(x = part, y = times, color = cate, group = cate)) +
  geom_line() +
  # Keep all seven weekdays on the axis, even unused ones.
  scale_x_discrete(drop = FALSE)
```

<br /><br /><br />

---

## Test data를 이용해 확인해보자.

<!--
4. test 데이터를 store_id 기준, 시간 순으로 판매횟수를 시각화 해보자.

```{r}
test_tt = read.csv(file.choose(), header = T, stringsAsFactors = F)
test_tt %>% head
```

맡은 구역은 40 ~ 79

```{r}
part = test_tt %>% filter(between(store_id, 40, 79))
part %>% head
part %>% select(time) %>% unlist %>% sort %>% head
```

test 데이터에는 초가 달려있다.

```{r}
part %>%
  group_by(store_id, time) %>%
  summarise(times = length(amount)) %>%
  .["times"] %>% summary

part %>%
  mutate(part = format(as.POSIXct(time, format = "%H:%M:%S"), "%H:%M")) %>%
  group_by(store_id, part) %>%
  summarise(times = length(amount)) %>%
  .["times"] %>% summary
```

없애자.

```{r}
final = part %>%
  mutate(part = format(as.POSIXct(time, format = "%H:%M:%S"), "%H:%M")) %>%
  group_by(store_id, part) %>%
  summarise(times = length(amount))
```

4-1. 통합 (40 - 79)

```{r}
final %>%
  ggplot(aes(color = factor(store_id))) +
  geom_point(aes(x = part, y = times)) +
  geom_vline(xintercept = c("12:00", "15:00", "18:00", "21:00")) +
  labs(color = NULL)
```

4-2. 10개씩 (40-49, 50-59, 60-69, 70-79)

```{r}
final %>%
  ggplot(aes(color = factor(store_id %% 10))) +
  facet_grid(rows = vars(store_id %/% 10)) +
  geom_point(aes(x = part, y = times)) +
  geom_vline(xintercept = c("12:00", "15:00", "18:00", "21:00")) +
  labs(color = NULL)
```

```{r eval = F}
for (i in 4:7) {
  start = i * 10
  finish = start + 9

  name = paste("test_point", start, "-", finish, ".jpg", sep = "")
  # name = paste("test_part_line_", start, "-", finish, ".jpg", sep = "")
  jpeg(name, width = 800, height = 600)
  plotting = final %>%
    filter(between(store_id, start, finish)) %>%
    ggplot(aes(color = factor(store_id))) +
    geom_point(aes(x = part, y = times)) +
    # geom_line(aes(x = part, y = times, group = store_id %% 10)) +
    geom_vline(xintercept = "12:00") + #, color = "orange") +
    geom_vline(xintercept = "15:00") + #, color = "green") +
    geom_vline(xintercept = "18:00") + #, color = "blue") +
    geom_vline(xintercept = "21:00") + #, color = "purple") +
    labs(color = NULL)
  print(plotting)
  dev.off()
}
```

번외: 선으로 표현

```{r}
final %>%
  ggplot(aes(color = factor(store_id %% 10))) +
  facet_grid(rows = vars(store_id %/% 10)) +
  geom_line(aes(x = part, y = times, group = store_id %% 10)) +
  geom_vline(xintercept = c("9:00", "12:00", "15:00", "18:00", "21:00")) +
  labs(x = NULL, y = NULL, color = NULL)
```

번외: colormap 이용

```{r eval = F}
library(RColorBrewer)
final %>%
  ggplot(aes(color = factor(store_id %% 10))) +
  facet_grid(rows = vars(store_id %/% 10)) +
  geom_point(aes(x = part, y = times)) +
  scale_color_brewer(palette = "Paired") + 
  geom_vline(xintercept = c("3:00", "6:00", "9:00", "12:00", "15:00", "18:00", "21:00")) +
  # ggtitle("40~49, 50~59, 60~69, 70~79") +
  labs(color = NULL)
```

4-3. 5개씩 (40-44, 45-49, 50-54, 55-59, 60-64, 65-69, 70-74, 75-79)

```{r}
final %>%
  ggplot(aes(color = factor(store_id %% 5))) +
  facet_grid(rows = vars(store_id %/% 5)) +
  geom_point(aes(x = part, y = times)) +
  geom_vline(xintercept = c("12:00", "15:00", "18:00", "21:00")) +
  labs(color = NULL)
```

한 블럭을 통해 확인해보자.

```{r}
i = 4
```

```{r}
i = i + 1
final %>%
  filter(between(store_id, i * 10, i * 10 + 9)) %>%
  ggplot(aes(color = factor(store_id %% 5))) +
  facet_grid(rows = vars(store_id %/% 5)) +
  geom_point(aes(x = part, y = times)) +
  geom_vline(xintercept = c("12:00", "15:00", "18:00", "21:00")) +
  ylim(0, 85) +
  labs(color = NULL)
```

4-4. 1개씩 확인해보자.

```{r}
nn = 40
```

```{r}
nn = nn + 1
final %>% filter(store_id == nn) %>%
  ggplot(aes(color = factor(store_id))) +
  geom_point(aes(x = part, y = times)) +
  geom_vline(xintercept = "12:00") + #, color = "black") +
  geom_vline(xintercept = "15:00") + #, color = "grey15") +
  geom_vline(xintercept = "18:00") + #, color = "yellow") +
  geom_vline(xintercept = "21:00") #+ #, color = "orange") # +
  # scale_x_continuous(breaks = c()) + 
  # scale_x_datetime(breaks = date_breaks("1 min"), labels = date_format("%H:%M"))
```

내꺼 부분 하나씩 확인

```{r}
test_tt = read.csv(file.choose(), header = T, stringsAsFactors = F)
part = test_tt %>% filter(between(store_id, 40, 79))
final = part %>%
  mutate(part = as.POSIXct(time, format = "%H:%M")) %>%
  group_by(store_id, part) %>%
  summarise(times = length(amount))
```

```{r}
for (i in 40:79) {
  name = paste(i, "point.jpg", sep = "")
  jpeg(name, width = 1200, height = 600)
  plotting = final %>%
    filter(store_id == i) %>%
    ggplot() +
    geom_point(aes(x = part, y = times)) +
    ggtitle(i) + 
    scale_x_datetime(date_labels = "%H:%M")
  print(plotting)
  dev.off()
}
```

```{r}
# Total number of transactions per store.
test_tt %>%
  group_by(store_id) %>%
  summarise(times = n())

# Transactions per store in 10-minute bins, as a scatter plot.
test_tt %>%
  mutate(part = as.POSIXct(time, format = "%H:%M")) %>%
  group_by(store_id, part = cut(part, "10 min")) %>%
  # The pipe into ggplot() was missing, so the plot was built without data.
  summarise(times = n(), .groups = "drop") %>%
  ggplot(aes(color = factor(store_id))) +
  geom_point(aes(x = part, y = times)) +
  guides(color = "none")
```

저장에 용이하도록

```{r}
name = paste("test.jpg", sep = "")
jpeg(name, width = 1200, height = 600)
plotting = test_tt %>%
  mutate(part = as.POSIXct(time, format = "%H")) %>%
  group_by(store_id, part) %>%
  summarise(times = length(amount)) %>%
  ggplot(aes(color = factor(store_id))) +
  geom_point(aes(x = part, y = times)) +
  scale_x_datetime(date_labels = "%H:%M") + 
  guides(color = F)
  print(plotting)
  dev.off()
```

```{r}
# Plot the per-minute transaction counts of one store and save them to
# "<present>point.jpg".
# `present` (the store_id to plot) is not set in this chunk; fail early with a
# clear message instead of an obscure lookup error.
stopifnot(exists("present"))

part <- test_tt %>% filter(store_id == present)
final <- part %>%
  mutate(part = as.POSIXct(time, format = "%H:%M")) %>%
  group_by(store_id, part) %>%
  summarise(times = n())

name <- paste0(present, "point.jpg")
plotting <- final %>%
  ggplot() +
  geom_point(aes(x = part, y = times)) +
  # Title with the store actually plotted (was a leftover loop index `i`).
  ggtitle(present) +
  scale_x_datetime(date_labels = "%H:%M")

jpeg(name, width = 1200, height = 600)
# finally = dev.off() closes the file even when drawing fails.
invisible(tryCatch(print(plotting), finally = dev.off()))
```

두번째 저장을 위한

```{r}
name = paste("total_a_week_line2.jpg", sep = "")
jpeg(name, width = 1200, height = 600)
plotting = total %>%
  filter(cate != "일반 음식점") %>%
  mutate(part = wday(transacted_date)) %>%
  group_by(cate, part) %>%
  summarise(times = length(amount)) %>%
  ggplot(aes(color = cate)) +
  geom_line(aes(x = part, y = times)) +
  # scale_x_discrete(label = c("일", "월", "화", "수", "목", "금", "토")) +
  labs(y = NULL, color = NULL)
print(plotting)
dev.off()
```
-->

4. test 처음부터

- 전체 판매 건수가 300 미만인 store_id는 판별 불가로 제외하기로 하였다.
- 횟수가 적어서 패턴을 보기 어려우니까 10분 단위로 묶자.
- 알아보기 쉽게 시간으로, 시간선 그리자.

데이터 불러오기

```{r}
# Load the test data from an interactively chosen CSV and show its first rows.
test_tt <- read.csv(
  file.choose(),
  header = TRUE,
  stringsAsFactors = FALSE
)
head(test_tt)
```

store_id 당 전체 판매 건수 확인 (맡은 거 확인)

```{r}
# store_ids with at least 300 transactions in total.
# Despite its name, `removed` holds the stores that are KEPT; stores below 300
# are the ones left out.
removed <- test_tt %>%
  group_by(store_id) %>%
  summarise(times = n()) %>%
  filter(times >= 300) %>%
  # pull() returns the ids as a plain unnamed vector, replacing the previous
  # "set the column names to NULL, then unlist" workaround.
  pull(store_id)
# The slice of stores handled in this part of the project.
removed[65:96]
```

10분 단위로 판매 횟수를 묶기

```{r}
# Transactions per store in 10-minute bins.
# `time` is parsed as HH:MM, then cut() assigns each timestamp to its bin.
test_part <- test_tt %>%
  mutate(part = as.POSIXct(time, format = "%H:%M")) %>%
  group_by(store_id, part = cut(part, "10 min")) %>%
  summarise(times = n())
# head(test_part)
```

저장) boxplot 확인하고자

```{r}
# Save a boxplot of the 10-minute transaction counts for each listed store to
# "<store_id>.jpg".
for (i in 34) {
  name <- paste0(i, ".jpg")

  # Build the plot before opening the device, so an error in the pipeline
  # cannot leave a graphics device open.
  plotting <- test_part %>%
    filter(store_id == i) %>%
    # The bin labels are stored as a factor; turn them back into timestamps.
    mutate(part = as.POSIXct(part)) %>%
    ggplot(aes(x = part, y = times, color = factor(store_id))) +
    geom_boxplot(outlier.color = "lightblue") +
    geom_text(aes(label = times), hjust = -0.3)

  jpeg(name, width = 600, height = 600)
  # finally = dev.off() closes the file even when drawing fails.
  tryCatch(print(plotting), finally = dev.off())
}
```

저장) 내 파트 저장하려고.

```{r}
# Save one scatter plot per store (10-minute transaction counts over the day)
# to "<store_id>.jpg", with dashed guide lines at fixed times of day.
limit <- as.POSIXct(c("00:00", "24:00"), format = "%H:%M")
time_line <- as.POSIXct(c("9:00", "12:00", "15:00", "18:00", "21:00", "24:00"), format = "%H:%M")

for (i in removed[65:96]) {
  name <- paste0(i, ".jpg")

  # Build the plot before opening the device, so an error in the pipeline
  # cannot leave a graphics device open.
  plotting <- test_part %>%
    filter(store_id == i) %>%
    # The bin labels are stored as a factor; turn them back into timestamps.
    mutate(part = as.POSIXct(part)) %>%
    ggplot(aes(x = part, y = times, color = factor(store_id))) +
    geom_point(size = 5) +
    # Axis covers the whole day; vertical lines mark the reference times.
    scale_x_datetime(date_labels = "%H:%M", limits = limit) +
    geom_vline(xintercept = time_line, linetype = 2) +
    # Labels, legend and title.
    labs(x = NULL, y = NULL, color = NULL) +
    ggtitle(i) +
    # The legend is hidden through theme().
    theme(legend.position = "none")

  jpeg(name, width = 1200, height = 600)
  # finally = dev.off() closes the file even when drawing fails.
  tryCatch(print(plotting), finally = dev.off())
}
```

저장) 뭉텅이로 요일

```{r}
# Save weekday transaction-count line charts for removed[65:96], four stores
# per image, to "<first id>~<last id>.jpg".
for (i in 0:7) {
  start <- 65 + i * 4
  finish <- start + 3

  name <- paste0(removed[start], "~", removed[finish], ".jpg")

  # Build the plot before opening the device, so an error in the pipeline
  # cannot leave a graphics device open.
  plotting <- test_tt %>%
    filter(store_id %in% removed[start:finish]) %>%
    # Fix the factor to all seven days (1 = Sunday) and attach the labels to
    # the levels, so they stay aligned even if a weekday has no rows.
    mutate(part = factor(
      wday(date, week_start = 7),
      levels = 1:7,
      labels = c("일", "월", "화", "수", "목", "금", "토")
    )) %>%
    group_by(store_id, part) %>%
    summarise(times = n(), .groups = "drop") %>%
    ggplot(aes(x = part, y = times, color = factor(store_id), group = store_id)) +
    geom_line() +
    scale_x_discrete(drop = FALSE) +
    labs(x = NULL, y = NULL, color = NULL)

  jpeg(name, width = 800, height = 600)
  # finally = dev.off() closes the file even when drawing fails.
  tryCatch(print(plotting), finally = dev.off())
}
```

이러다 좀 더 포괄적인 저장 함수 구현에 대해 신경 쓸 것만 같은 그런 느낌

4-1. 다양하게 그림을 그려보자.

```{r}
# Load the combined training data from an interactively chosen CSV and list
# its column names.
total <- read.csv(
  file.choose(),
  row.names = NULL,
  stringsAsFactors = FALSE
)
colnames(total)
```

date: transacted_date, transacted_time
factor: (store_id, card_id,) card_company, installment_term(, region)
continuous: amount
output: (type_of_business,) cate

```{r}
# Every transaction's amount by card company, coloured by category and shaped
# by whether it is a refund (amount < 0).
total %>%
  # A plain mutate() adds the refund flag. The earlier group_by() plus
  # multi-row summarise() only reordered the rows and relies on deprecated
  # summarise() behaviour.
  mutate(refund = amount < 0) %>%
  # Keep the previous drawing order (sorted by the former grouping columns).
  arrange(transacted_time, card_company, cate) %>%
  ggplot(aes(x = card_company, y = amount, color = cate, shape = refund)) +
  geom_point() +
  labs(x = NULL, y = NULL, color = NULL)
```

```{r}
# Refund ratio per category: number of refunds (amount < 0) divided by the
# number of non-refund transactions.
tt <- total %>%
  group_by(cate) %>%
  summarise(times = n(), refund = sum(amount < 0)) %>%
  mutate(refund = refund / (times - refund))

# Bar chart of the ratio, each bar labelled with the value rounded to 5 digits.
# To write it to "total_refund.jpg" (800 x 600) instead, assign the plot to a
# variable and wrap print() of it between jpeg() and dev.off().
ggplot(tt, aes(x = cate, y = refund)) +
  geom_col(fill = "lightblue") +
  geom_text(aes(label = round(refund, 5))) +
  labs(x = NULL, y = NULL, color = NULL)
```

시간별로 amount(or 카드 긁은 횟수)를 보는데, card_company는 모양으로, cate는 색으로?

```{r}
# Transactions per time of day: colour encodes the category, point shape the
# card company.
total %>%
  group_by(card_company, cate, transacted_time) %>%
  summarise(times = n(), refund = sum(amount < 0)) %>%
  mutate(part = as.POSIXct(transacted_time, format = "%H:%M")) %>%
  ggplot() +
  geom_point(aes(x = part, y = times, color = cate, shape = card_company)) +
  scale_x_datetime(date_labels = "%H:%M") +
  labs(x = NULL, y = NULL, color = NULL)
```

카드 회사별로 소비자의 소비 category를 시각화한 건데, 실패

업종 -> (어떤 과정에 쓰일 특징 증거들)
store_id -> (어떤 과정) -> 업종

```{r}
# Stacked horizontal bars: transactions per card company, split by category.
total %>%
  group_by(card_company, cate) %>%
  summarise(times = n(), refund = sum(amount < 0), .groups = "drop") %>%
  ggplot(aes(x = card_company, y = times, fill = cate)) +
  geom_col() +
  coord_flip() +
  # The category is mapped to fill, so the fill legend title is the one to
  # blank out (color = NULL had no effect on this plot).
  labs(x = NULL, y = NULL, fill = NULL)
```

4-2. 유형 확인

```{r}
# Load the test data from an interactively chosen CSV and list its column names.
test <- read.csv(
  file.choose(),
  row.names = NULL,
  stringsAsFactors = FALSE
)
colnames(test)
```

```{r}
# Transactions per store in 10-minute bins.
# The full table is stored. Previously a trailing `%>% head` was part of the
# assignment, so test_part kept only its first six rows.
test_part <- test %>%
  mutate(part = as.POSIXct(time, format = "%H:%M")) %>%
  group_by(store_id, part = cut(part, "10 min")) %>%
  summarise(times = n())
# Preview only.
test_part %>% head()
```

4-3. 나눈 유형에 업종을 붙여보자.

```{r}
# store_id groups ("types") used below to filter the test data.
betweens1 <- c(
  4, 6, 8, 13, 16, 19, 22, 23, 31, 35,
  38, 40, 54, 55, 56, 58, 62, 72, 74, 81,
  85, 101, 117, 118, 120, 135, 137, 139, 143, 148,
  149, 154, 155, 156, 158, 163, 177, 179, 180, 182,
  184, 190, 193, 194, 78, 106, 151, 166, 176
)

betweens2 <- c(79, 100, 116, 142, 152, 160)

# 88 is not included here (cf. the "betweens3_without_88.jpg" output below).
betweens3 <- c(
  0, 2, 3, 12, 14, 15, 17, 18, 24, 30,
  37, 39, 42, 48, 49, 50, 53, 59, 60, 61,
  65, 66, 68, 73, 86, 90, 91, 96, 98, 99,
  102, 104, 109, 112, 115, 119, 123, 125, 128, 132,
  134, 159, 161, 169, 174, 178, 185, 186, 188, 191,
  195, 10, 20, 25, 45
)

betweens4 <- c(172)
betweens5 <- c(
  11, 69, 80, 82, 87, 103, 105, 107, 110, 114,
  122, 131, 140, 144, 153, 162, 165, 167
)
betweens6 <- c(34)
```

```{r}
# Save the 10-minute transaction counts of the type-3 stores (betweens3) to
# betweens3_without_88.jpg.
name <- "betweens3_without_88.jpg"

# Build the plot before opening the device, so an error in the pipeline cannot
# leave a graphics device open.
plotting <- test_part %>%
  filter(store_id %in% betweens3) %>%
  # The bin labels are stored as a factor; turn them back into timestamps.
  mutate(part = as.POSIXct(part)) %>%
  ggplot(aes(color = factor(store_id))) +
  geom_point(aes(x = part, y = times)) +
  scale_x_datetime(date_labels = "%H:%M") +
  ggtitle("3유형") +
  labs(x = NULL, y = NULL, color = NULL) #+
  # guides(color = "none")

jpeg(name, width = 800, height = 600)
# finally = dev.off() closes the file even when drawing fails.
invisible(tryCatch(print(plotting), finally = dev.off()))
```

4-4. 모비율 검정

```{r}
# Read the second sheet of an interactively chosen Excel workbook and preview it.
temp <- read.xlsx(
  file.choose(),
  sheetIndex = 2,
  header = TRUE,
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
head(temp)
```

```{r eval = F}
# Per-category transaction count and refund ratio (refunds / non-refunds),
# with the column names stripped, followed by a few positional look-ups.
tt <- total %>%
  group_by(cate) %>%
  summarise(times = n(), refund = sum(amount < 0)) %>%
  mutate(refund = refund / (times - refund))
colnames(tt) <- NULL
# First row: count and ratio.
tt[1, 2:3]
# Category labels as a vector.
unlist(tt[, 1])
tt
```

4-4-1. 여기서부터는 유형별로 다릅니다.

```{r}
# Pool rows 6 and 9 of tt: n accumulates column 2 (counts), x_part accumulates
# column 2 times column 3 (count x ratio).
x_part <- 0
n <- 0
for (i in c(6, 9)) {
  size <- unlist(tt[i, 2])
  x_part <- x_part + size * unlist(tt[i, 3])
  n <- n + size
}
x_part
n
```

나눈 유형별로 비율 따오자.

```{r}
# Transaction count and refund ratio (refunds / non-refunds) for the type-1
# stores (betweens1), stored without column names.
tt1 <- test %>%
  filter(store_id %in% betweens1) %>%
  summarise(times = n(), refund = sum(amount < 0)) %>%
  mutate(refund = refund / (times - refund))
names(tt1) <- NULL
tt1
```

```{r}
# Two-sample proportion test: pooled training rows (x_part out of n) against
# the type-1 test stores (ratio x count out of count).
prop2 <- unlist(tt1[2])
n <- c(n, unlist(tt1[1]))
x <- c(x_part, unlist(tt1[1]) * prop2)
prop.test(x = x, n = n, alternative = "two.sided", conf.level = 0.95)
```

<!-- 출처: https://rfriend.tistory.com/129 [R, Python 분석과 프로그래밍의 친구 (by R Friend)]
prop <- c(0.33, 0.41) # proportion of events
n <- c(500, 600) # number of trials
x <- prop*n # number of events

prop.test(x = x, # number of events
+           n = n, # number of trials
+           alternative = c("two.sided"), # alternative = c("two.sided", "less", "greater")
+           conf.level = 0.95) # confidence level (= 1- significance level 'alpha') -->

4-4-2. 표로 이쁘게 뽑아보자.

```{r}
# Per category: non-refund transaction count (column 2) and refund count
# (column 3), stored without column names.
tt <- total %>%
  group_by(cate) %>%
  summarise(times = n(), refund = sum(amount < 0)) %>%
  mutate(times = times - refund)
colnames(tt) <- NULL
tt
```

```{r}
# Non-refund count (column 1) and refund count (column 2) for each store-id
# type, one row per type in the order betweens1 .. betweens5.
# The table is built in one pass instead of five copy-pasted pipelines that
# wrote into a tt1 left over from an earlier chunk.
tt1 <- bind_rows(lapply(
  list(betweens1, betweens2, betweens3, betweens4, betweens5),
  function(ids) {
    test %>%
      filter(store_id %in% ids) %>%
      summarise(times = n(), refund = sum(amount < 0)) %>%
      # times becomes the number of non-refund transactions.
      mutate(times = times - refund)
  }
))
# write.csv(tt, "total_refund_ratio.csv")
# write.csv(tt1, "test_refund_ratio.csv")
tt1
```

6, 9 / 11 / 4 / 8, 3 / 2 / 1

```{r}
# Pool rows 6 and 9 of tt (x: column 3, n: column 2) and compare them with the
# first row of tt1 in a two-sample proportion test.
x <- 0
n <- 0
for (i in c(6, 9)) {
  n <- n + unlist(tt[i, 2])
  x <- x + unlist(tt[i, 3])
}

prop.test(
  x = c(unlist(tt1[1, 2]), x),
  n = c(unlist(tt1[1, 1]), n),
  alternative = "two.sided",
  conf.level = 0.95
)
```

```{r eval = F}
# One-sample variant: test the first row of tt1 against the pooled proportion p.
p <- x / n
p
prop.test(
  x = unlist(tt1[1, 2]),
  n = unlist(tt1[1, 1]),
  p = p,
  alternative = "two.sided",
  conf.level = 0.95
)
```

---

4-4-3. 다시 확인

test data 정제

```{r}
# Test data: load from an interactively chosen CSV and preview the first rows.
test <- read.csv(
  file.choose(),
  header = TRUE,
  stringsAsFactors = FALSE
)
head(test)
```

```{r}
# Distinct store_ids whose 업종번호 is 6, printed as a plain vector.
dd <- unique(select(filter(test, 업종번호 == 6), store_id))
names(dd) <- NULL
unlist(dd)
```

```{r}
# test %>% filter(업종번호 == 6) %>% select(store_id) %>% unique %>% as.list %>% .[[1]] %>% length

# Non-refund count (column 1) and refund count (column 2) of the test data for
# 업종번호 1 through 6, one row per number.
# The table is built in one pass instead of six copy-pasted pipelines that
# wrote into a tt1 left over from an earlier chunk.
tt1 <- bind_rows(lapply(1:6, function(k) {
  test %>%
    filter(업종번호 == k) %>%
    summarise(times = n(), refund = sum(amount < 0)) %>%
    # times becomes the number of non-refund transactions.
    mutate(times = times - refund)
}))
tt1
```

train data 정제

```{r}
# Train data: load from an interactively chosen CSV, preview the first rows
# and list the column names.
total <- read.csv(
  file.choose(),
  header = TRUE,
  stringsAsFactors = FALSE
)
head(total)
colnames(total)
```
```{r}
# Distinct category labels present in the training data.
unlist(unique(select(total, cate))[1])
```

```{r}
# Non-refund count (column 1) and refund count (column 2) of the training data
# for six category groups, one row per group in the order listed.
# The table is built in one pass instead of six copy-pasted pipelines.
tt <- bind_rows(lapply(
  list(
    c("일반 음식점", "음식소매업"),
    c("휴게 음식점"),
    c("오락 및 여가"),
    c("의류", "서비스업"),
    c("미용"),
    c("교육")
  ),
  function(cates) {
    total %>%
      filter(cate %in% cates) %>%
      summarise(times = n(), refund = sum(amount < 0)) %>%
      # times becomes the number of non-refund transactions.
      mutate(times = times - refund)
  }
))
tt
```

```{r}
# Show the test-side counts again so they can be compared with tt printed above.
tt1
```

- 6, 9 / 11 / 4 / 8, 3 / 2 / 1

하나씩 확인해보자.

```{r}
# Two-sample proportion test for one row: train (tt) against test (tt1).
# Column 2 holds the refund counts (events), column 1 the non-refund counts.
index <- 5
x <- c(unlist(tt[index, 2]), unlist(tt1[index, 2]))
n <- c(unlist(tt[index, 1]), unlist(tt1[index, 1]))
prop.test(x, n, alternative = "two.sided", conf.level = 0.95)
```

```{r}
# Run the train-vs-test proportion test for rows 1..6 and collect one summary
# row per test: the inputs, the confidence interval returned by prop.test()
# and the p-value.
# Each row is created with its final column names, so the result no longer
# depends on automatic name repair of unnamed columns, does not overwrite
# `temp`, and is bound once instead of growing inside a loop.
to_csv <- bind_rows(lapply(1:6, function(index) {
  x <- unname(c(unlist(tt[index, 2]), unlist(tt1[index, 2])))
  n <- unname(c(unlist(tt[index, 1]), unlist(tt1[index, 1])))
  summ <- prop.test(x = x, n = n, alternative = "two.sided", conf.level = 0.95)
  tibble(
    index = index,
    total_events = x[1],
    total_times = n[1],
    test_events = x[2],
    test_times = n[2],
    conf_low = summ$conf.int[1],
    conf_high = summ$conf.int[2],
    p_value = summ$p.value
  )
}))
to_csv
```
```{r}
# Export the test summary. write.csv() always writes comma-separated output;
# passing sep is ignored with a warning, so it is dropped.
write.csv(to_csv, "prop_hypothesis.csv")
```

---

#### 추가) 그림 뽑아주려고.

특정 카테고리 그림 뽑기

```{r}
# Transactions per time of day for a single category ("의류").
# To save the figure instead of displaying it, assign the plot to a variable
# and wrap print() of it between jpeg(<file>, width = 1000, height = 600) and
# dev.off().
total %>%
  filter(cate %in% c("의류")) %>%
  group_by(transacted_time) %>%
  summarise(times = n()) %>%
  mutate(part = as.POSIXct(transacted_time, format = "%H:%M")) %>%
  ggplot(aes(x = part, y = times)) +
  # A fixed colour goes outside aes(). Inside aes() the string "red" was
  # treated as a data value, giving a default palette colour and a legend.
  geom_point(color = "red") +
  scale_x_datetime(date_labels = "%H:%M") +
  scale_y_continuous(limits = c(0, 1000)) +
  labs(x = NULL, y = NULL)
```

업종번호에 따라 다르게

```{r}
# Save the transactions per time of day of the test data, coloured by
# 업종번호, to test_custom2.jpg.
name <- "test_custom2.jpg"

# Build the plot before opening the device, so an error in the pipeline cannot
# leave a graphics device open.
plotting <- test %>%
  group_by(업종번호, time) %>%
  summarise(times = n(), .groups = "drop") %>%
  # Leave out rows without an 업종번호.
  filter(업종번호 != "") %>%
  mutate(part = as.POSIXct(time, format = "%H:%M")) %>%
  ggplot(aes(color = factor(업종번호))) +
  geom_point(aes(x = part, y = times)) +
  scale_x_datetime(date_labels = "%H:%M") +
  labs(x = NULL, y = NULL, color = NULL)

jpeg(name, width = 1200, height = 600)
# finally = dev.off() closes the file even when drawing fails.
invisible(tryCatch(print(plotting), finally = dev.off()))
```

특정 store_id 그림

```{r}
# Save the transactions per time of day of each listed store to
# "<store_id>_test.jpg".
for (present in c(34)) {
  name <- paste0(present, "_test.jpg")

  # Build the plot before opening the device, so an error in the pipeline
  # cannot leave a graphics device open.
  plotting <- test %>%
    filter(store_id == present) %>%
    group_by(time) %>%
    summarise(times = n()) %>%
    mutate(part = as.POSIXct(time, format = "%H:%M")) %>%
    ggplot() +
    geom_point(aes(x = part, y = times), color = "forestgreen") +
    scale_x_datetime(date_labels = "%H:%M") +
    labs(x = NULL, y = NULL, color = NULL)

  jpeg(name, width = 800, height = 600)
  # finally = dev.off() closes the file even when drawing fails.
  tryCatch(print(plotting), finally = dev.off())
}
```