Data analysis of YouTube channels dataset in R

Data visualization/데이터시각화(R)

Data analysis of YouTube channels dataset in R

뉴욕킴 2023. 11. 21. 23:44

#Load packages

install.packages("sf")
install.packages("here")
install.packages("skimr")
install.packages("janitor")
install.packages("maps")
install.packages("plotly")

library(readr)
library(dplyr)
library(here)
library(skimr)
library(janitor)
library(lubridate)
library(ggplot2)

library(maps)
library(plotly)
library(sf)

# Read data from file

# Load the raw CSV into `data_to_clean`.
# NOTE(review): this is an absolute, machine-specific path; point it at the
# local copy of the file before running.
data_to_clean <- read.csv('C:/Users/eznay/Downloads/Global YouTube Statistics.csv')

# Overview

# First rows of the raw data.
head(data_to_clean)

# Compact column-by-column preview (dplyr is already attached at the top of
# the script, so it is not reinstalled here). The preview must target
# `data_to_clean`: the cleaned `data` frame is only created later in the
# script, so `glimpse(data)` at this point would pick up the `utils::data`
# function instead of the dataset.
glimpse(data_to_clean)

# Base-R structure summary of the same data frame.
str(data_to_clean)

# Original column names as produced by read.csv.
names(data_to_clean)

[1] "rank"                                   
 [2] "Youtuber"                               
 [3] "subscribers"                            
 [4] "video.views"                            
 [5] "category"                               
 [6] "Title"                                  
 [7] "uploads"                                
 [8] "Country"                                
 [9] "Abbreviation"                           
[10] "channel_type"                           
[11] "video_views_rank"                       
[12] "country_rank"                           
[13] "channel_type_rank"                      
[14] "video_views_for_the_last_30_days"       
[15] "lowest_monthly_earnings"                
[16] "highest_monthly_earnings"               
[17] "lowest_yearly_earnings"                 
[18] "highest_yearly_earnings"                
[19] "subscribers_for_last_30_days"           
[20] "created_year"                           
[21] "created_month"                          
[22] "created_date"                           
[23] "Gross.tertiary.education.enrollment...."
[24] "Population"                             
[25] "Unemployment.rate"                      
[26] "Urban_population"                       
[27] "Latitude"                               
[28] "Longitude"

# Preview what the column names look like after lower-casing them and
# normalising them with janitor::clean_names().
# janitor is already installed and attached at the top of the script, so it is
# not reinstalled on every run; the library() call is kept so this step still
# works when run on its own in a fresh session.
library(janitor)

# The result is only printed, not assigned back: `data_to_clean` keeps its
# original column names (e.g. `video.views`), which the later steps use.
data_to_clean %>%
  rename_with(tolower) %>%
  clean_names() %>%
  names()

[1] "rank"                                "youtuber"                           
 [3] "subscribers"                         "video_views"                        
 [5] "category"                            "title"                              
 [7] "uploads"                             "country"                            
 [9] "abbreviation"                        "channel_type"                       
[11] "video_views_rank"                    "country_rank"                       
[13] "channel_type_rank"                   "video_views_for_the_last_30_days"   
[15] "lowest_monthly_earnings"             "highest_monthly_earnings"           
[17] "lowest_yearly_earnings"              "highest_yearly_earnings"            
[19] "subscribers_for_last_30_days"        "created_year"                       
[21] "created_month"                       "created_date"                       
[23] "gross_tertiary_education_enrollment" "population"                         
[25] "unemployment_rate"                   "urban_population"                   
[27] "latitude"                            "longitude"

# Check data integrity

# Data duplicates

# Count rows that are exact duplicates of an earlier row, then print the count.
duplicated_rows <- data_to_clean %>%
  duplicated() %>%
  sum()
duplicated_rows

[1] 0

# Missing values

# Number of missing values in each column, printed as a named vector.
na_values <- data_to_clean %>%
  is.na() %>%
  colSums()
na_values

  rank                                Youtuber 
                                      0                                       0 
                            subscribers                             video.views 
                                      0                                       0 
                               category                                   Title 
                                      0                                       0 
                                uploads                                 Country 
                                      0                                       0 
                           Abbreviation                            channel_type 
                                      0                                       0 
                       video_views_rank                            country_rank 
                                      1                                     116 
                      channel_type_rank        video_views_for_the_last_30_days 
                                     33                                      56 
                lowest_monthly_earnings                highest_monthly_earnings 
                                      0                                       0 
                 lowest_yearly_earnings                 highest_yearly_earnings 
                                      0                                       0 
           subscribers_for_last_30_days                            created_year 
                                    337                                       5 
                          created_month                            created_date 
                                      0                                       5 
Gross.tertiary.education.enrollment....                              Population 
                                    123                                     123 
                      Unemployment.rate                        Urban_population 
                                    123                                     123 
                               Latitude                               Longitude 
                                    123                                     123

# Cleaning data

# Rows that have at least one missing value, shown for inspection.
na_rows <- data_to_clean[!complete.cases(data_to_clean), ]
head(na_rows)

# Keep only rows where Latitude, created_year and channel_type_rank are all
# present. The columns are passed to complete.cases() directly instead of
# being combined with `&`: `&` coerces the numbers to logical, and because
# `FALSE & NA` is `FALSE`, a 0 in one column would hide an NA in another and
# let an incomplete row through.
data <- data_to_clean[
  complete.cases(
    data_to_clean[, c("Latitude", "created_year", "channel_type_rank")]
  ),
]

# Remaining missing values per column after the filter.
na_rows_full <- colSums(is.na(data))
na_rows_full

    rank                                Youtuber 
                                      0                                       0 
                            subscribers                             video.views 
                                      0                                       0 
                               category                                   Title 
                                      0                                       0 
                                uploads                                 Country 
                                      0                                       0 
                           Abbreviation                            channel_type 
                                      0                                       0 
                       video_views_rank                            country_rank 
                                      0                                       0 
                      channel_type_rank        video_views_for_the_last_30_days 
                                      0                                      15 
                lowest_monthly_earnings                highest_monthly_earnings 
                                      0                                       0 
                 lowest_yearly_earnings                 highest_yearly_earnings 
                                      0                                       0 
           subscribers_for_last_30_days                            created_year 
                                    268                                       0 
                          created_month                            created_date 
                                      0                                       0 
Gross.tertiary.education.enrollment....                              Population 
                                      0                                       0 
                      Unemployment.rate                        Urban_population 
                                      0                                       0 
                               Latitude                               Longitude 
                                      0                                       0

# Fill the missing "last 30 days" figures with a per-month average over the
# channel's lifetime (lifetime total divided by whole months of existence).

# Reference date against which each channel's age in months is measured.
set_date <- '2023-08-05'

# lubridate is already attached at the top of the script, so it is not
# reinstalled here; library() is kept so this step also runs on its own.
library(lubridate)

# Build the creation date from its year / month / day columns.
data$date <- with(data, ymd(sprintf('%04d%03s%02d', created_year, created_month, created_date)))

# Whole months between creation and the reference date. Clamp to at least 1 so
# that a channel younger than one month does not cause a division by zero
# (which would yield Inf and leave the value missing).
months_active <- pmax(interval(data$date, set_date) %/% months(1), 1)

# Replace only the missing entries, and take each replacement from the SAME
# row. Assigning the full-length vector to the NA positions would hand out the
# first few rows' values instead of the matching rows' values.
missing_subs <- is.na(data$subscribers_for_last_30_days)
data$subscribers_for_last_30_days[missing_subs] <-
  as.integer(round(data$subscribers[missing_subs] / months_active[missing_subs]))

# View counts can exceed R's 32-bit integer range, so the rounded value is
# kept as a double rather than forced through as.integer() (which would turn
# an overflowing value into NA).
replacement_values <- round(data$video.views / months_active)
missing_views <- is.na(data$video_views_for_the_last_30_days)
data$video_views_for_the_last_30_days[missing_views] <- replacement_values[missing_views]


# Re-check the missing-value counts per column after imputation.
na_rows_full <- colSums(is.na(data))
na_rows_full

 rank                                Youtuber 
                                      0                                       0 
                            subscribers                             video.views 
                                      0                                       0 
                               category                                   Title 
                                      0                                       0 
                                uploads                                 Country 
                                      0                                       0 
                           Abbreviation                            channel_type 
                                      0                                       0 
                       video_views_rank                            country_rank 
                                      0                                       0 
                      channel_type_rank        video_views_for_the_last_30_days 
                                      0                                       0 
                lowest_monthly_earnings                highest_monthly_earnings 
                                      0                                       0 
                 lowest_yearly_earnings                 highest_yearly_earnings 
                                      0                                       0 
           subscribers_for_last_30_days                            created_year 
                                      0                                       0 
                          created_month                            created_date 
                                      0                                       0 
Gross.tertiary.education.enrollment....                              Population 
                                      0                                       0 
                      Unemployment.rate                        Urban_population 
                                      0                                       0 
                               Latitude                               Longitude 
                                      0                                       0 
                                   date 
                                      0

# data visualization

library(ggplot2)
library(plotly)

# Check the distribution of 'video.views' and 'subscribers' columns

# ('video.views'와 'subscribers' 컬럼의 분포 확인)

# Histogram of total video views per channel (30 bins).
ggplot(data = data) +
  geom_histogram(
    mapping = aes(x = video.views),
    bins = 30,
    fill = "skyblue",
    color = "white"
  ) +
  labs(x = "Video Views", y = "Count", title = "Video Views Distribution")

# Histogram of subscriber counts per channel (30 bins).
ggplot(data = data) +
  geom_histogram(
    mapping = aes(x = subscribers),
    bins = 30,
    fill = "lightgreen",
    color = "white"
  ) +
  labs(x = "Subscribers", y = "Count", title = "Subscribers Distribution")

# Comparison of the averages of 'video.views' and 'subscribers' by 'channel_type'

# ('channel_type'별 'video.views'와 'subscribers'의 평균 비교)

'Data visualization > 데이터시각화(R)' 카테고리의 다른 글

Data analysis of YouTube channels dataset in R (0)	2023.12.02
한국의 인기 YouTube 동영상 통계 (1)	2023.11.23
데이터시각화(R)_Texas flight data (0)	2023.11.11
데이터시각화(R)_탐색적 자료분석 EDA (0)	2023.11.01
데이터시각화(R)_Tidy Data ＃ (0)	2023.10.29

현재글Data analysis of YouTube channels dataset in R

250x250

데이터 공부하는 뉴욕킴

데이터 공부하는 뉴욕킴입니다. 데이터 사이언스 함께 공부해요😘

r, 데이터시각화, 태블로, SQL, 딥러닝, 빅데이터, 파이썬, 빅데이터공모전, 국비지원, 파이썬머신러닝완벽가이드, 데이터베이스, 이대데이터사이언스, 파이썬기초, 머신러닝, 머신러닝완벽가이드, 프로그래머스, 데이터, 패스트캠퍼스, 회귀, 데이터분석,

Today :
Yesterday :

데이터 공부하는 뉴욕킴