1. 패키지 로드
library(sf)
library(here)
library(skimr)
library(janitor)
library(maps)
library(plotly)
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
2. 데이터 불러오기
# Read the raw "Global YouTube Statistics" CSV export into a data frame.
# NOTE(review): the path is absolute and specific to one machine.
csv_path <- 'C:/Users/eznay/Downloads/Global YouTube Statistics.csv'
data_to_clean <- read.csv(csv_path)
3. 데이터 현황 파악
# Preview the first rows, then list every column with its type and a sample
# of its values.
data_to_clean %>% head()
data_to_clean %>% glimpse()
Rows: 995
Columns: 28
$ rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
$ youtuber <chr> "T-Series", "YouTube Movies", "MrBeast"…
$ subscribers <int> 245000000, 170000000, 166000000, 162000…
$ video_views <dbl> 228000000000, 0, 28368841870, 164000000…
$ category <chr> "Music", "Film & Animation", "Entertain…
$ title <chr> "T-Series", "youtubemovies", "MrBeast",…
$ uploads <int> 20082, 1, 741, 966, 116536, 0, 1111, 47…
$ country <chr> "India", "United States", "United State…
$ abbreviation <chr> "IN", "US", "US", "US", "IN", "nan", "U…
$ channel_type <chr> "Music", "Games", "Entertainment", "Edu…
$ video_views_rank <int> 1, 4055159, 48, 2, 3, 4057944, 5, 44, 6…
$ country_rank <dbl> 1, 7670, 1, 2, 2, NaN, 3, 1, 5, 5, 3, 6…
$ channel_type_rank <dbl> 1, 7423, 1, 1, 2, NaN, 3, 4, 25, 6, 2, …
$ video_views_for_the_last_30_days <dbl> 2258000000, 12, 1348000000, 1975000000,…
$ lowest_monthly_earnings <dbl> 564600, 0, 337000, 493800, 455900, 0, 1…
$ highest_monthly_earnings <dbl> 9.000e+06, 5.000e-02, 5.400e+06, 7.900e…
$ lowest_yearly_earnings <dbl> 6.800e+06, 4.000e-02, 4.000e+06, 5.900e…
$ highest_yearly_earnings <dbl> 1.084e+08, 5.800e-01, 6.470e+07, 9.480e…
$ subscribers_for_last_30_days <dbl> 2000000, NaN, 8000000, 1000000, 1000000…
$ created_year <dbl> 2006, 2006, 2012, 2006, 2006, 2013, 201…
$ created_month <chr> "Mar", "Mar", "Feb", "Sep", "Sep", "Sep…
$ created_date <dbl> 13, 5, 20, 1, 20, 24, 12, 29, 14, 23, 1…
$ gross_tertiary_education_enrollment <dbl> 28.1, 88.2, 88.2, 88.2, 28.1, NaN, 88.2…
$ population <dbl> 1366417754, 328239523, 328239523, 32823…
$ unemployment_rate <dbl> 5.36, 14.70, 14.70, 14.70, 5.36, NaN, 1…
$ urban_population <dbl> 471031528, 270663028, 270663028, 270663…
$ latitude <dbl> 20.59368, 37.09024, 37.09024, 37.09024,…
$ longitude <dbl> 78.962880, -95.712891, -95.712891, -95.712891,…
# Compact structural summary: dimensions plus the type of each column.
data_to_clean %>% str()
'data.frame': 995 obs. of 28 variables:
$ rank : int 1 2 3 4 5 6 7 8 9 10 ...
$ youtuber : chr "T-Series" "YouTube Movies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
$ subscribers : int 245000000 170000000 166000000 162000000 159000000 119000000 112000000 111000000 106000000 98900000 ...
$ video_views : num 2.28e+11 0.00 2.84e+10 1.64e+11 1.48e+11 ...
$ category : chr "Music" "Film & Animation" "Entertainment" "Education" ...
$ title : chr "T-Series" "youtubemovies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
$ uploads : int 20082 1 741 966 116536 0 1111 4716 493 574 ...
$ country : chr "India" "United States" "United States" "United States" ...
$ abbreviation : chr "IN" "US" "US" "US" ...
$ channel_type : chr "Music" "Games" "Entertainment" "Education" ...
$ video_views_rank : int 1 4055159 48 2 3 4057944 5 44 630 8 ...
$ country_rank : num 1 7670 1 2 2 NaN 3 1 5 5 ...
$ channel_type_rank : num 1 7423 1 1 2 ...
$ video_views_for_the_last_30_days : num 2.26e+09 1.20e+01 1.35e+09 1.98e+09 1.82e+09 ...
$ lowest_monthly_earnings : num 564600 0 337000 493800 455900 ...
$ highest_monthly_earnings : num 9.0e+06 5.0e-02 5.4e+06 7.9e+06 7.3e+06 ...
$ lowest_yearly_earnings : num 6.8e+06 4.0e-02 4.0e+06 5.9e+06 5.5e+06 ...
$ highest_yearly_earnings : num 1.08e+08 5.80e-01 6.47e+07 9.48e+07 8.75e+07 ...
$ subscribers_for_last_30_days : num 2e+06 NaN 8e+06 1e+06 1e+06 NaN NaN NaN 1e+05 6e+05 ...
$ created_year : num 2006 2006 2012 2006 2006 ...
$ created_month : chr "Mar" "Mar" "Feb" "Sep" ...
$ created_date : num 13 5 20 1 20 24 12 29 14 23 ...
$ gross_tertiary_education_enrollment: num 28.1 88.2 88.2 88.2 28.1 NaN 88.2 63.2 81.9 88.2 ...
$ population : num 1.37e+09 3.28e+08 3.28e+08 3.28e+08 1.37e+09 ...
$ unemployment_rate : num 5.36 14.7 14.7 14.7 5.36 NaN 14.7 2.29 4.59 14.7 ...
$ urban_population : num 4.71e+08 2.71e+08 2.71e+08 2.71e+08 4.71e+08 ...
$ latitude : num 20.6 37.1 37.1 37.1 20.6 ...
$ longitude : num 79 -95.7 -95.7 -95.7 79 ...
# List the column names of the data frame.
colnames(data_to_clean)
names(data_to_clean)
[1] "rank" "youtuber"
[3] "subscribers" "video_views"
[5] "category" "title"
[7] "uploads" "country"
[9] "abbreviation" "channel_type"
[11] "video_views_rank" "country_rank"
[13] "channel_type_rank" "video_views_for_the_last_30_days"
[15] "lowest_monthly_earnings" "highest_monthly_earnings"
[17] "lowest_yearly_earnings" "highest_yearly_earnings"
[19] "subscribers_for_last_30_days" "created_year"
[21] "created_month" "created_date"
[23] "gross_tertiary_education_enrollment" "population"
[25] "unemployment_rate" "urban_population"
[27] "latitude" "longitude"
4. 데이터 정제
# Normalise the column names in two steps: lower-case every name first, then
# pass the result through janitor's clean_names().
lowercased <- rename_with(data_to_clean, tolower)
data_to_clean <- clean_names(lowercased)
5. 데이터 중복 확인
# Count the rows that are exact repeats of an earlier row.
duplicated_rows <- data_to_clean %>%
  duplicated() %>%
  sum()
duplicated_rows
[1] 0
6. 결측치 확인
# Tally the missing entries in each column.
na_values <- data_to_clean %>%
  is.na() %>%
  colSums()
na_values
rank youtuber
0 0
subscribers video_views
0 0
category title
0 0
uploads country
0 0
abbreviation channel_type
0 0
video_views_rank country_rank
1 116
channel_type_rank video_views_for_the_last_30_days
33 56
lowest_monthly_earnings highest_monthly_earnings
0 0
lowest_yearly_earnings highest_yearly_earnings
0 0
subscribers_for_last_30_days created_year
337 5
created_month created_date
0 5
gross_tertiary_education_enrollment population
123 123
unemployment_rate urban_population
123 123
latitude longitude
123 123
7. 데이터 정제
# Rows that contain at least one missing value, kept aside for inspection.
na_rows <- data_to_clean[!complete.cases(data_to_clean), ]
head(na_rows)

# Keep only the rows where latitude, created_year and channel_type_rank are
# all present. complete.cases() has to receive the columns themselves:
# combining them with `&` first coerces the numbers to logicals, and because
# `NA & FALSE` evaluates to FALSE (not NA), a zero in one column would hide a
# missing value in another and the row would wrongly be kept.
required_cols <- c("latitude", "created_year", "channel_type_rank")
data <- data_to_clean[complete.cases(data_to_clean[, required_cols]), ]

# Re-check how many missing values remain in each column after filtering.
na_rows_full <- colSums(is.na(data))
na_rows_full
rank youtuber
0 0
subscribers video_views
0 0
category title
0 0
uploads country
0 0
abbreviation channel_type
0 0
video_views_rank country_rank
0 0
channel_type_rank video_views_for_the_last_30_days
0 15
lowest_monthly_earnings highest_monthly_earnings
0 0
lowest_yearly_earnings highest_yearly_earnings
0 0
subscribers_for_last_30_days created_year
268 0
created_month created_date
0 0
gross_tertiary_education_enrollment population
0 0
unemployment_rate urban_population
0 0
latitude longitude
0 0
8. 결측치 채우기
# Reference date used to measure how long each channel has existed.
set_date <- '2023-11-21'

# Build the channel creation date from its year / month-abbreviation / day
# columns (e.g. "2006Mar13") and parse it as a Date.
data$date <- ymd(sprintf('%04d%03s%02d', data$created_year, data$created_month, data$created_date))

# Whole months between channel creation and the reference date. Clamp to at
# least one month so a channel younger than a month does not divide by zero.
months_active <- interval(data$date, ymd(set_date)) %/% months(1)
months_active <- pmax(months_active, 1)

# Fill each missing 30-day figure with that channel's own lifetime monthly
# average. The right-hand side must be subset with the same mask as the
# left-hand side: assigning the full-length vector would make R take its
# first k elements (values belonging to other rows) and emit a
# "number of items to replace" warning.
# The values are kept as doubles rather than passed through as.integer(),
# which would turn anything above .Machine$integer.max into NA.
missing_subs <- is.na(data$subscribers_for_last_30_days)
data$subscribers_for_last_30_days[missing_subs] <-
  round(data$subscribers[missing_subs] / months_active[missing_subs])

replacement_values <- round(data$video_views / months_active)
missing_views <- is.na(data$video_views_for_the_last_30_days)
data$video_views_for_the_last_30_days[missing_views] <-
  replacement_values[missing_views]

# Confirm that no missing values remain.
na_rows_full <- colSums(is.na(data))
na_rows_full
rank youtuber
0 0
subscribers video_views
0 0
category title
0 0
uploads country
0 0
abbreviation channel_type
0 0
video_views_rank country_rank
0 0
channel_type_rank video_views_for_the_last_30_days
0 0
lowest_monthly_earnings highest_monthly_earnings
0 0
lowest_yearly_earnings highest_yearly_earnings
0 0
subscribers_for_last_30_days created_year
0 0
created_month created_date
0 0
gross_tertiary_education_enrollment population
0 0
unemployment_rate urban_population
0 0
latitude longitude
0 0
date
0
9. 구독자가 가장 많은 유튜버 Top 10 시각화
# Re-encode the channel titles as UTF-8 (bytes that cannot be converted are
# substituted in hex form), then rewrite "Kids Diana Show" with underscores.
data$title <- gsub(
  "Kids Diana Show", "Kids_Diana_Show",
  iconv(data$title, to = "UTF-8", sub = "byte")
)

# Sum subscribers per title and keep the ten largest totals (ties included),
# ordered from largest to smallest.
top_subscribers <- data %>%
  group_by(title) %>%
  summarise(total_subscribers = sum(subscribers)) %>%
  slice_max(total_subscribers, n = 10) %>%
  arrange(desc(total_subscribers))

# Bar chart of those totals, bars ordered by subscriber count.
ggplot(
  top_subscribers,
  aes(x = reorder(title, total_subscribers), y = total_subscribers)
) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Top 10 YouTube Channels with the Most Subscribers",
    x = "Channel Title",
    y = "Total Subscribers"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
10. 조회수가 가장 많은 채널 Top 10 시각화
# The ten rows (channels) with the highest lifetime view counts.
top_views <- data %>%
  arrange(desc(video_views)) %>%
  slice_head(n = 10)

# Bar chart of those view counts, bars ordered by views.
# Each row of `data` is a channel, so the plot title says "Channels"; the
# original title ("Videos") mislabelled what the bars represent.
ggplot(top_views, aes(x = reorder(title, video_views), y = video_views)) +
  geom_col(fill = "lightgreen") +
  labs(
    title = "Top 10 YouTube Channels with the Most Views",
    x = "Channel Title",
    y = "Total Views"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
11. 구독자 vs 조회수 관계 시각화
# Scatter plot with one point per row of `data`: subscribers on the x axis,
# lifetime views on the y axis.
ggplot(data) +
  geom_point(aes(x = subscribers, y = video_views)) +
  labs(
    title = "Relationship between Subscribers and Views",
    x = "Subscribers",
    y = "Views"
  ) +
  theme(legend.position = "none")
12. 카테고리별 유튜버 수 시각화
# Number of rows (channels) in each category.
category_counts <- data %>%
  count(category, name = "count")

# Bar chart of the per-category counts.
# The x axis shows category names, not numeric IDs, so it is labelled
# "Category". The tick labels are rotated like the other bar charts in this
# script so that the names do not overlap.
ggplot(category_counts, aes(x = category, y = count)) +
  geom_col(fill = "orange") +
  labs(
    title = "Number of YouTube Channels by Category",
    x = "Category",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
13. 채널 유형별 조회수 시각화
# Total lifetime views per channel type, largest first.
channel_views <- data %>%
  group_by(channel_type) %>%
  summarise(total_views = sum(video_views)) %>%
  arrange(desc(total_views))

# channel_type is a character column, so ggplot lays the bars out
# alphabetically and ignores the row order produced by arrange().
# reorder() makes the bars follow the descending totals instead.
ggplot(
  channel_views,
  aes(
    x = reorder(channel_type, -total_views),
    y = total_views,
    fill = channel_type
  )
) +
  geom_col() +
  labs(
    title = "Total Views by Channel Type",
    x = "Channel Type",
    y = "Total Views"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
14. 채널 유형별 구독자 수 시각화
# Total subscribers per channel type, largest first.
channel_subscribers <- data %>%
  group_by(channel_type) %>%
  summarise(total_subscribers = sum(subscribers)) %>%
  arrange(desc(total_subscribers))

# channel_type is a character column, so ggplot lays the bars out
# alphabetically and ignores the row order produced by arrange().
# reorder() makes the bars follow the descending totals instead.
ggplot(
  channel_subscribers,
  aes(
    x = reorder(channel_type, -total_subscribers),
    y = total_subscribers,
    fill = channel_type
  )
) +
  geom_col() +
  labs(
    title = "Total Subscribers by Channel Type",
    x = "Channel Type",
    y = "Total Subscribers"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
15. 채널 유형별 업로드 수 시각화
# Total number of uploads per channel type, largest first.
# The total must add up the `uploads` column: n() would only count how many
# channels fall in each type, which is not what the chart claims to show.
channel_uploads <- data %>%
  group_by(channel_type) %>%
  summarise(total_uploads = sum(uploads)) %>%
  arrange(desc(total_uploads))

# reorder() keeps the bars in descending order; without it the character
# x axis would be drawn alphabetically regardless of the arrange() above.
ggplot(
  channel_uploads,
  aes(
    x = reorder(channel_type, -total_uploads),
    y = total_uploads,
    fill = channel_type
  )
) +
  geom_col() +
  labs(
    title = "Total Uploads by Channel Type",
    x = "Channel Type",
    y = "Total Uploads"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
16. 연간 수익이 가장 높은 상위 20개 YouTube 채널 시각화
# The twenty rows (channels) with the highest upper-bound yearly earnings.
# The chart is titled "Highest Annual Earnings", so it ranks and plots
# highest_yearly_earnings; the monthly column used before contradicted the
# plot title.
top_earning <- data %>%
  arrange(desc(highest_yearly_earnings)) %>%
  slice_head(n = 20)

# Bar chart of those yearly earnings, bars ordered by earnings.
ggplot(
  top_earning,
  aes(x = reorder(title, highest_yearly_earnings), y = highest_yearly_earnings)
) +
  geom_col(fill = "pink") +
  labs(
    title = "Top 20 YouTube Channels with the Highest Annual Earnings",
    x = "Channel Title",
    y = "Highest Yearly Earnings"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
17. 구독자 수, 업로드 수, 조회 수, 연간 수입 간의 상관관계 시각화
# Columns whose pairwise correlations are examined.
correlation_vars <- c(
  "subscribers", "video_views", "uploads", "highest_monthly_earnings"
)
correlation_data <- select(data, all_of(correlation_vars))

# Correlation matrix of the selected columns (cor() defaults).
correlation_matrix <- cor(correlation_data)

# Interactive heatmap of the matrix; both axes carry the variable names.
axis_labels <- colnames(correlation_matrix)
correlation_heatmap <- plot_ly(
  x = axis_labels,
  y = axis_labels,
  z = correlation_matrix,
  type = "heatmap",
  colorscale = "Viridis"
)
layout(
  correlation_heatmap,
  title = "Correlation Matrix",
  xaxis = list(title = "Variables"),
  yaxis = list(title = "Variables")
)
18. 구독자 수, 채널 유형, 국가, 채널 생성일 등의 여러 요인 중 조회수에 가장 큰 영향을 끼치는 요인 회귀분석
Call:
lm(formula = video_views ~ subscribers + channel_type + country +
date, data = regression_data)
Residuals:
Min 1Q Median 3Q Max
-1.035e+11 -3.188e+09 -1.097e+08 2.716e+09 7.172e+10
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.798e+09 1.111e+10 0.882 0.3780
subscribers 6.421e+02 1.809e+01 35.498 <2e-16 ***
channel_typeAutos -2.778e+09 8.909e+09 -0.312 0.7552
channel_typeComedy -6.386e+09 5.480e+09 -1.165 0.2442
channel_typeEducation -3.411e+09 5.465e+09 -0.624 0.5327
channel_typeEntertainment -5.721e+09 5.326e+09 -1.074 0.2831
channel_typeFilm -6.476e+09 5.525e+09 -1.172 0.2415
channel_typeGames -8.209e+09 5.407e+09 -1.518 0.1294
channel_typeHowto -8.384e+09 5.551e+09 -1.510 0.1313
channel_typeMusic -4.081e+09 5.351e+09 -0.763 0.4459
channel_typeNews -6.104e+09 5.625e+09 -1.085 0.2782
channel_typeNonprofit -1.406e+10 8.367e+09 -1.681 0.0932 .
channel_typePeople -5.544e+09 5.412e+09 -1.025 0.3059
channel_typeSports -6.205e+09 6.017e+09 -1.031 0.3027
channel_typeTech -1.164e+10 5.832e+09 -1.995 0.0464 *
countryArgentina -3.846e+09 9.540e+09 -0.403 0.6869
countryAustralia -5.661e+09 9.750e+09 -0.581 0.5617
countryBangladesh 9.564e+08 1.302e+10 0.073 0.9415
countryBarbados -8.862e+09 1.298e+10 -0.683 0.4950
countryBrazil -7.216e+09 9.281e+09 -0.778 0.4370
countryCanada -4.629e+09 9.485e+09 -0.488 0.6257
countryChile -9.771e+09 1.058e+10 -0.924 0.3558
countryChina -8.052e+09 1.309e+10 -0.615 0.5386
countryColombia -4.793e+09 9.620e+09 -0.498 0.6184
countryCuba -9.760e+09 1.302e+10 -0.750 0.4537
countryEcuador -7.368e+09 1.122e+10 -0.657 0.5117
countryEgypt -6.010e+09 1.129e+10 -0.532 0.5948
countryEl Salvador -1.939e+10 1.293e+10 -1.500 0.1341
countryFinland -6.343e+09 1.294e+10 -0.490 0.6242
countryFrance -7.761e+09 1.008e+10 -0.770 0.4418
countryGermany -2.739e+09 1.009e+10 -0.271 0.7862
countryIndia -5.475e+09 9.249e+09 -0.592 0.5541
countryIndonesia -9.007e+09 9.378e+09 -0.960 0.3371
countryIraq -4.161e+09 1.129e+10 -0.369 0.7125
countryItaly -5.134e+09 1.128e+10 -0.455 0.6490
countryJapan -1.006e+10 1.022e+10 -0.984 0.3254
countryJordan -8.943e+08 1.063e+10 -0.084 0.9330
countryKuwait -1.749e+10 1.300e+10 -1.346 0.1788
countryLatvia 6.334e+09 1.309e+10 0.484 0.6285
countryMalaysia -4.957e+09 1.297e+10 -0.382 0.7025
countryMexico -7.878e+09 9.343e+09 -0.843 0.3994
countryMorocco -9.206e+09 1.299e+10 -0.709 0.4788
countryNetherlands -6.812e+09 1.058e+10 -0.644 0.5198
countryPakistan 9.281e+08 1.009e+10 0.092 0.9268
countryPeru -1.011e+10 1.299e+10 -0.778 0.4367
countryPhilippines -4.713e+09 9.605e+09 -0.491 0.6238
countryRussia -3.773e+09 9.474e+09 -0.398 0.6906
countrySamoa -6.199e+09 1.298e+10 -0.478 0.6330
countrySaudi Arabia -5.844e+09 9.685e+09 -0.603 0.5464
countrySingapore 4.518e+08 1.126e+10 0.040 0.9680
countrySouth Korea -8.059e+09 9.474e+09 -0.851 0.3952
countrySpain -7.016e+09 9.378e+09 -0.748 0.4546
countrySweden -7.277e+09 1.029e+10 -0.707 0.4797
countrySwitzerland -9.209e+09 1.326e+10 -0.695 0.4875
countryThailand -1.844e+09 9.457e+09 -0.195 0.8455
countryTurkey 6.142e+09 1.027e+10 0.598 0.5501
countryUkraine -4.929e+09 9.775e+09 -0.504 0.6142
countryUnited Arab Emirates -5.757e+09 9.919e+09 -0.580 0.5618
countryUnited Kingdom -6.432e+09 9.302e+09 -0.692 0.4894
countryUnited States -5.973e+09 9.217e+09 -0.648 0.5172
countryVenezuela -1.205e+10 1.307e+10 -0.922 0.3568
countryVietnam -3.835e+09 1.065e+10 -0.360 0.7189
date -9.801e+04 2.206e+05 -0.444 0.6569
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9.131e+09 on 799 degrees of freedom
Multiple R-squared: 0.6496, Adjusted R-squared: 0.6224
F-statistic: 23.89 on 62 and 799 DF, p-value: < 2.2e-16
19. 비슷한 카테고리의 영상이 인기가 많은 국가를 클러스터링
# Total views for every (category, country) pair, returned ungrouped and
# sorted from the most-viewed pair to the least-viewed one.
views_by_pair <- summarise(
  group_by(data, category, country),
  total_views = sum(video_views),
  .groups = 'drop'
)
category_country <- arrange(views_by_pair, desc(total_views))
20. lowest_monthly_earnings, highest_monthly_earnings 변수 이용해서 연간 수입 변화의 추세 시각화
# Keep only the `date` column and the two monthly earnings bounds.
earnings_trend <- select(
  data, date, lowest_monthly_earnings, highest_monthly_earnings
)

# Line colours keyed by the legend labels used in the aes() calls below.
earnings_palette <- c(
  "Lowest Monthly Earnings" = "blue",
  "Highest Monthly Earnings" = "red"
)

# Two lines over `date`: the lower and upper monthly earnings estimates.
ggplot(earnings_trend, aes(x = date)) +
  geom_line(
    aes(y = lowest_monthly_earnings, color = "Lowest Monthly Earnings")
  ) +
  geom_line(
    aes(y = highest_monthly_earnings, color = "Highest Monthly Earnings")
  ) +
  labs(title = "Annual Earnings Trend", x = "Date", y = "Monthly Earnings") +
  scale_color_manual(values = earnings_palette)
'Data visualization > 데이터시각화(R)' 카테고리의 다른 글
한국의 인기 YouTube 동영상 통계 (1) | 2023.11.23 |
---|---|
Data analysis of YouTube channels dataset in R (1) | 2023.11.21 |
데이터시각화(R)_Texas flight data (0) | 2023.11.11 |
데이터시각화(R)_탐색적 자료분석 EDA (0) | 2023.11.01 |
데이터시각화(R)_Tidy Data # (0) | 2023.10.29 |