Data visualization/데이터시각화(R)

한국의 인기 YouTube 동영상 통계

뉴욕킴 2023. 11. 23. 22:33

# 패키지 로드
library(sf)
library(here)
library(skimr)
library(janitor)
library(maps)
library(plotly)
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)

# Load the raw YouTube statistics export from a local CSV file.
data_to_clean <- read.csv(
  file = "C:/Users/eznay/Downloads/Global YouTube Statistics.csv"
)

# First look at the data: leading rows, column types/structure, column names.
head(x = data_to_clean)
glimpse(data_to_clean)
str(object = data_to_clean)
names(x = data_to_clean)

# Normalize column names: lower-case every name first, then pass the result
# through clean_names().
data_to_clean <- clean_names(rename_with(data_to_clean, tolower))

# Count rows that are exact duplicates of an earlier row.
duplicated_rows <- data_to_clean %>%
  duplicated() %>%
  sum()
duplicated_rows

# Count missing values in each column.
na_values <- data_to_clean %>%
  is.na() %>%
  colSums()
na_values

# Inspect the rows that contain at least one missing value.
na_rows <- data_to_clean[!complete.cases(data_to_clean), ]
head(na_rows)

# Keep only rows where latitude, created_year and channel_type_rank are all
# present. The columns are handed to complete.cases() as a data frame rather
# than combined with `&`: `&` coerces numbers to logical and `FALSE & NA` is
# FALSE (not NA), so a row holding a 0 in one column and NA in another would
# wrongly be treated as complete.
data <- data_to_clean[
  complete.cases(
    data_to_clean[, c("latitude", "created_year", "channel_type_rank")]
  ),
]

# Remaining missing values per column after the filter.
na_rows_full <- colSums(is.na(data))
na_rows_full

# Fill missing 30-day metrics with the channel's average per month of
# existence, measured up to a fixed reference date.
set_date <- '2023-11-21'

# Build the channel creation date from its year / month / day columns.
data$date <- ymd(
  sprintf('%04d%03s%02d', data$created_year, data$created_month, data$created_date)
)

# Whole months between channel creation and the reference date. Clamped to at
# least 1 so a channel created within the last month (or after set_date) does
# not cause a division by zero or a negative average.
months_active <- pmax(interval(data$date, ymd(set_date)) %/% months(1), 1)

# Only the NA positions are replaced, and the replacement is indexed with the
# same mask so every row receives its own average (assigning the full-length
# vector into the NA subset would misalign the values).
# Values stay double: as.integer() would turn averages above
# .Machine$integer.max into NA.
missing_subs <- is.na(data$subscribers_for_last_30_days)
data$subscribers_for_last_30_days[missing_subs] <-
  round(data$subscribers[missing_subs] / months_active[missing_subs])

replacement_values <- round(data$video_views / months_active)

missing_views <- is.na(data$video_views_for_the_last_30_days)
data$video_views_for_the_last_30_days[missing_views] <-
  replacement_values[missing_views]

# Re-check the remaining missing values per column.
na_rows_full <- colSums(is.na(data))
na_rows_full

# Top 10 channels by total subscribers, shown as a bar chart.
top_subscribers <- data %>%
  group_by(channel_title) %>%
  summarise(total_subscribers = sum(subscribers)) %>%
  top_n(n = 10, wt = total_subscribers) %>%
  arrange(desc(total_subscribers))

# Bars are ordered by subscriber count; x labels are tilted for readability.
top_subscribers %>%
  ggplot(
    aes(
      x = reorder(channel_title, total_subscribers),
      y = total_subscribers
    )
  ) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Top 10 YouTube Channels with the Most Subscribers",
    x = "Channel Title",
    y = "Total Subscribers"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Top 10 rows by video views, shown as a bar chart.
top_views <- data %>%
  arrange(desc(video_views)) %>%
  slice(seq_len(10))

# Bars are ordered by view count; x labels are tilted for readability.
top_views %>%
  ggplot(aes(x = reorder(channel_title, video_views), y = video_views)) +
  geom_col(fill = "lightgreen") +
  labs(
    title = "Top 10 YouTube Videos with the Most Views",
    x = "Channel Title",
    y = "Total Views"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Scatter plot of subscribers against video views, one point per row.
data %>%
  ggplot(aes(subscribers, video_views)) +
  geom_point() +
  labs(
    title = "Relationship between Subscribers and Views",
    x = "Subscribers",
    y = "Views"
  ) +
  theme(legend.position = "none")

# Number of rows (channels) in each category, shown as a bar chart.
category_counts <- summarise(
  group_by(data, category_id),
  count = n()
)

category_counts %>%
  ggplot(aes(x = category_id, y = count)) +
  geom_col(fill = "orange") +
  labs(
    title = "Number of YouTube Channels by Category",
    x = "Category ID",
    y = "Count"
  )

# Total video views per channel type, sorted from highest to lowest.
channel_views <- data %>%
  group_by(channel_type) %>%
  summarise(total_views = sum(video_views)) %>%
  arrange(desc(total_views))

# One bar per channel type, coloured by type.
channel_views %>%
  ggplot(aes(x = channel_type, y = total_views, fill = channel_type)) +
  geom_col() +
  labs(
    title = "Total Views by Channel Type",
    x = "Channel Type",
    y = "Total Views"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total subscribers per channel type, sorted from highest to lowest.
channel_subscribers <- data %>%
  group_by(channel_type) %>%
  summarise(total_subscribers = sum(subscribers)) %>%
  arrange(desc(total_subscribers))

# One bar per channel type, coloured by type.
channel_subscribers %>%
  ggplot(
    aes(x = channel_type, y = total_subscribers, fill = channel_type)
  ) +
  geom_col() +
  labs(
    title = "Total Subscribers by Channel Type",
    x = "Channel Type",
    y = "Total Subscribers"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total uploads per channel type, sorted from highest to lowest.
# The `uploads` column is summed: n() would only count how many channels each
# type has, which is not what the "Total Uploads" chart claims to show.
channel_uploads <- data %>%
  group_by(channel_type) %>%
  summarise(total_uploads = sum(uploads, na.rm = TRUE)) %>%
  arrange(desc(total_uploads))

# One bar per channel type, coloured by type.
ggplot(channel_uploads, aes(x = channel_type, y = total_uploads, fill = channel_type)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Uploads by Channel Type", x = "Channel Type", y = "Total Uploads") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Top 20 channels ranked by highest_monthly_earnings, shown as a bar chart.
# NOTE(review): the original heading and title said "annual" earnings, but the
# ranking and the y axis both use the monthly figure. The title now matches the
# plotted metric; if annual earnings were intended, switch to a yearly column
# (confirm that the dataset provides one).
top_earning <- data %>%
  arrange(desc(highest_monthly_earnings)) %>%
  slice(1:20)

ggplot(top_earning, aes(x = reorder(channel_title, highest_monthly_earnings), y = highest_monthly_earnings)) +
  geom_bar(stat = "identity", fill = "pink") +
  labs(title = "Top 20 YouTube Channels with the Highest Monthly Earnings", x = "Channel Title", y = "Highest Monthly Earnings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Correlation between subscribers, video views, uploads and highest monthly
# earnings, rendered as an interactive heatmap.
correlation_data <- select(
  data,
  subscribers, video_views, uploads, highest_monthly_earnings
)

correlation_matrix <- cor(correlation_data)

# Both axes carry the same variable names, taken from the matrix columns.
layout(
  plot_ly(
    x = colnames(correlation_matrix),
    y = colnames(correlation_matrix),
    z = correlation_matrix,
    type = "heatmap",
    colorscale = "Viridis"
  ),
  title = "Correlation Matrix",
  xaxis = list(title = "Variables"),
  yaxis = list(title = "Variables")
)

# Linear regression of video views on subscribers, channel type, country and
# channel creation date, to see which factor relates most strongly to views.
# The response column video_views must be part of the modelling data frame;
# without it lm() cannot find the variable named in the formula.
regression_data <- data %>%
  select(video_views, subscribers, channel_type, country, date)

regression_model <- lm(video_views ~ subscribers + channel_type + country + date, data = regression_data)
summary(regression_model)

# Total video views for every (category, country) pair, sorted from highest to
# lowest. This is only the aggregation step for grouping countries by the
# categories that are popular in them; no clustering is performed here.
# `.groups = "drop"` returns an ungrouped result, so later verbs do not
# silently operate per category_id, and it silences the regrouping message.
category_country <- data %>%
  group_by(category_id, country) %>%
  summarise(total_views = sum(video_views), .groups = "drop") %>%
  arrange(desc(total_views))

# Lowest and highest monthly earnings plotted against channel creation date.
earnings_trend <- select(
  data,
  date, lowest_monthly_earnings, highest_monthly_earnings
)

# Each series is mapped to a fixed legend label, which scale_color_manual()
# then assigns a colour.
earnings_trend %>%
  ggplot(aes(x = date)) +
  geom_line(
    aes(y = lowest_monthly_earnings, color = "Lowest Monthly Earnings")
  ) +
  geom_line(
    aes(y = highest_monthly_earnings, color = "Highest Monthly Earnings")
  ) +
  labs(title = "Annual Earnings Trend", x = "Date", y = "Monthly Earnings") +
  scale_color_manual(
    values = c(
      "Lowest Monthly Earnings" = "blue",
      "Highest Monthly Earnings" = "red"
    )
  )

# Population, Unemployment.rate, Urban_population을 이용하여 각 변수가 조회수/채널 수익과 어떤 관계가 있는지 확인
correlation_data_2 <- data %>%
  select(video_views, highest_monthly