Data visualization/데이터시각화(R)

Data analysis of YouTube channels dataset in R

뉴욕킴 2023. 11. 21. 23:44

#Load packages

install.packages("sf")
install.packages("here")
install.packages("skimr")
install.packages("janitor")
install.packages("maps")
install.packages("plotly")

library(readr)
library(dplyr)
library(here)
library(skimr)
library(janitor)
library(lubridate)
library(ggplot2)

library(maps)
library(plotly)
library(sf)

 

# Load the raw "Global YouTube Statistics" CSV into a data frame.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
data_to_clean <- read.csv(
  file = "C:/Users/eznay/Downloads/Global YouTube Statistics.csv"
)

 

# Overview

# First rows of the raw data.
head(data_to_clean)

# dplyr is already attached at the top of the script; only install it when it
# is genuinely missing instead of reinstalling it on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Inspect the raw data frame. The cleaned `data` object does not exist yet at
# this point, so `glimpse(data)` would inspect the `utils::data` function
# rather than the dataset.
glimpse(data_to_clean)

# Column types and the original column names.
str(data_to_clean)

names(data_to_clean)
[1] "rank"                                   
 [2] "Youtuber"                               
 [3] "subscribers"                            
 [4] "video.views"                            
 [5] "category"                               
 [6] "Title"                                  
 [7] "uploads"                                
 [8] "Country"                                
 [9] "Abbreviation"                           
[10] "channel_type"                           
[11] "video_views_rank"                       
[12] "country_rank"                           
[13] "channel_type_rank"                      
[14] "video_views_for_the_last_30_days"       
[15] "lowest_monthly_earnings"                
[16] "highest_monthly_earnings"               
[17] "lowest_yearly_earnings"                 
[18] "highest_yearly_earnings"                
[19] "subscribers_for_last_30_days"           
[20] "created_year"                           
[21] "created_month"                          
[22] "created_date"                           
[23] "Gross.tertiary.education.enrollment...."
[24] "Population"                             
[25] "Unemployment.rate"                      
[26] "Urban_population"                       
[27] "Latitude"                               
[28] "Longitude"            
# janitor is already attached at the top of the script; only install it when
# it is genuinely missing instead of reinstalling it on every run.
if (!requireNamespace("janitor", quietly = TRUE)) {
  install.packages("janitor")
}
library(janitor)

# Preview what the column names would look like after lower-casing and
# janitor's clean_names(). The result is only printed, not assigned, so
# `data_to_clean` keeps its original column names for the code that follows.
data_to_clean %>%
  rename_with(tolower) %>%
  clean_names() %>%
  names()
[1] "rank"                                "youtuber"                           
 [3] "subscribers"                         "video_views"                        
 [5] "category"                            "title"                              
 [7] "uploads"                             "country"                            
 [9] "abbreviation"                        "channel_type"                       
[11] "video_views_rank"                    "country_rank"                       
[13] "channel_type_rank"                   "video_views_for_the_last_30_days"   
[15] "lowest_monthly_earnings"             "highest_monthly_earnings"           
[17] "lowest_yearly_earnings"              "highest_yearly_earnings"            
[19] "subscribers_for_last_30_days"        "created_year"                       
[21] "created_month"                       "created_date"                       
[23] "gross_tertiary_education_enrollment" "population"                         
[25] "unemployment_rate"                   "urban_population"                   
[27] "latitude"                            "longitude"  

 

# Check data integrity

# Count rows that are exact duplicates of an earlier row.
duplicated_rows <- data_to_clean %>%
  duplicated() %>%
  sum()
duplicated_rows
[1] 0

 

# Number of missing values in each column of the raw data.
na_values <- data_to_clean %>%
  is.na() %>%
  colSums()
na_values
  rank                                Youtuber 
                                      0                                       0 
                            subscribers                             video.views 
                                      0                                       0 
                               category                                   Title 
                                      0                                       0 
                                uploads                                 Country 
                                      0                                       0 
                           Abbreviation                            channel_type 
                                      0                                       0 
                       video_views_rank                            country_rank 
                                      1                                     116 
                      channel_type_rank        video_views_for_the_last_30_days 
                                     33                                      56 
                lowest_monthly_earnings                highest_monthly_earnings 
                                      0                                       0 
                 lowest_yearly_earnings                 highest_yearly_earnings 
                                      0                                       0 
           subscribers_for_last_30_days                            created_year 
                                    337                                       5 
                          created_month                            created_date 
                                      0                                       5 
Gross.tertiary.education.enrollment....                              Population 
                                    123                                     123 
                      Unemployment.rate                        Urban_population 
                                    123                                     123 
                               Latitude                               Longitude 
                                    123                                     123 

 

# Cleaning data

# Rows that contain at least one missing value, for a quick visual check.
na_rows <- data_to_clean[!complete.cases(data_to_clean), ]
head(na_rows)

# Keep only rows where Latitude, created_year and channel_type_rank are all
# present. The columns are passed to complete.cases() as a data frame rather
# than combined with `&`: a logical AND of numeric columns yields FALSE (not
# NA) whenever one operand is 0, which would let a row with an NA slip through.
data <- data_to_clean[
  complete.cases(
    data_to_clean[, c("Latitude", "created_year", "channel_type_rank")]
  ),
]

# Missing values remaining per column after the row filter.
na_rows_full <- colSums(is.na(data))
na_rows_full
    rank                                Youtuber 
                                      0                                       0 
                            subscribers                             video.views 
                                      0                                       0 
                               category                                   Title 
                                      0                                       0 
                                uploads                                 Country 
                                      0                                       0 
                           Abbreviation                            channel_type 
                                      0                                       0 
                       video_views_rank                            country_rank 
                                      0                                       0 
                      channel_type_rank        video_views_for_the_last_30_days 
                                      0                                      15 
                lowest_monthly_earnings                highest_monthly_earnings 
                                      0                                       0 
                 lowest_yearly_earnings                 highest_yearly_earnings 
                                      0                                       0 
           subscribers_for_last_30_days                            created_year 
                                    268                                       0 
                          created_month                            created_date 
                                      0                                       0 
Gross.tertiary.education.enrollment....                              Population 
                                      0                                       0 
                      Unemployment.rate                        Urban_population 
                                      0                                       0 
                               Latitude                               Longitude 
                                      0                                       0 

 

# Snapshot date used as the end of each channel's lifetime when estimating
# monthly averages.
set_date <- '2023-08-05'

# lubridate is already attached at the top of the script; only install it when
# it is genuinely missing instead of reinstalling it on every run.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate)

# Channel creation date assembled from the year / month / day-of-month columns.
data$date <- with(data, ymd(sprintf('%04d%03s%02d', created_year, created_month, created_date)))

# Whole months between creation and the snapshot date, clamped to at least one
# month so a channel created within the last month does not divide by zero.
months_active <- pmax(interval(data$date, set_date) %/% months(1), 1)

# Impute missing 30-day subscriber gains with the channel's lifetime monthly
# average. The replacement is indexed with the same NA mask as the target:
# assigning the full-length vector into the NA subset would recycle its first
# values and attach them to the wrong rows.
missing_subs <- is.na(data$subscribers_for_last_30_days)
data$subscribers_for_last_30_days[missing_subs] <-
  as.integer(round(data$subscribers[missing_subs] / months_active[missing_subs]))

# Same imputation for 30-day views. The values stay double because a monthly
# view average can exceed R's 32-bit integer range, where as.integer() gives NA.
replacement_values <- round(data$video.views / months_active)
missing_views <- is.na(data$video_views_for_the_last_30_days)
data$video_views_for_the_last_30_days[missing_views] <- replacement_values[missing_views]

# Re-check the missing-value counts after imputation.
na_rows_full <- colSums(is.na(data))
na_rows_full
 rank                                Youtuber 
                                      0                                       0 
                            subscribers                             video.views 
                                      0                                       0 
                               category                                   Title 
                                      0                                       0 
                                uploads                                 Country 
                                      0                                       0 
                           Abbreviation                            channel_type 
                                      0                                       0 
                       video_views_rank                            country_rank 
                                      0                                       0 
                      channel_type_rank        video_views_for_the_last_30_days 
                                      0                                       0 
                lowest_monthly_earnings                highest_monthly_earnings 
                                      0                                       0 
                 lowest_yearly_earnings                 highest_yearly_earnings 
                                      0                                       0 
           subscribers_for_last_30_days                            created_year 
                                      0                                       0 
                          created_month                            created_date 
                                      0                                       0 
Gross.tertiary.education.enrollment....                              Population 
                                      0                                       0 
                      Unemployment.rate                        Urban_population 
                                      0                                       0 
                               Latitude                               Longitude 
                                      0                                       0 
                                   date 
                                      0 

 

# data visualization

library(ggplot2)
library(plotly)

 

# Check the distribution of 'video.views' and 'subscribers' columns

# ('video.views'와 'subscribers' 컬럼의 분포 확인)

# Histogram of total video views per channel (30 bins).
ggplot(data = data, mapping = aes(x = video.views)) +
  geom_histogram(bins = 30, colour = "white", fill = "skyblue") +
  labs(x = "Video Views", y = "Count", title = "Video Views Distribution")

# Histogram of subscriber counts per channel (30 bins).
ggplot(data = data, mapping = aes(x = subscribers)) +
  geom_histogram(bins = 30, colour = "white", fill = "lightgreen") +
  labs(x = "Subscribers", y = "Count", title = "Subscribers Distribution")

 

# Comparison of the averages of 'video.views' and 'subscribers' by 'channel_type'

# ('channel_type'별 'video.views'와 'subscribers'의 평균 비교)