# Load packages ----
# Install only the packages that are not yet available. Calling
# install.packages() unconditionally re-downloads everything on each run and
# can fail when it tries to overwrite a package that is already loaded.
required_packages <- c(
  "readr", "dplyr", "here", "skimr", "janitor",
  "lubridate", "ggplot2", "maps", "plotly", "sf"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(readr)
library(dplyr)
library(here)
library(skimr)
library(janitor)
library(lubridate)
library(ggplot2)
library(maps)
library(plotly)
library(sf)
# Load the raw CSV into a data frame.
# NOTE(review): the path is an absolute, machine-specific location.
csv_path <- "C:/Users/eznay/Downloads/Global YouTube Statistics.csv"
data_to_clean <- read.csv(file = csv_path)
# Overview ----
# dplyr is already attached at the top of the script, so it is not
# reinstalled or reloaded here.
head(data_to_clean)
# Inspect the raw data frame: `data` is not created until the cleaning step,
# so at this point the name `data` still resolves to the utils::data() function.
glimpse(data_to_clean)
str(data_to_clean)
names(data_to_clean)
#> [1] "rank"
#> [2] "Youtuber"
#> [3] "subscribers"
#> [4] "video.views"
#> [5] "category"
#> [6] "Title"
#> [7] "uploads"
#> [8] "Country"
#> [9] "Abbreviation"
#> [10] "channel_type"
#> [11] "video_views_rank"
#> [12] "country_rank"
#> [13] "channel_type_rank"
#> [14] "video_views_for_the_last_30_days"
#> [15] "lowest_monthly_earnings"
#> [16] "highest_monthly_earnings"
#> [17] "lowest_yearly_earnings"
#> [18] "highest_yearly_earnings"
#> [19] "subscribers_for_last_30_days"
#> [20] "created_year"
#> [21] "created_month"
#> [22] "created_date"
#> [23] "Gross.tertiary.education.enrollment...."
#> [24] "Population"
#> [25] "Unemployment.rate"
#> [26] "Urban_population"
#> [27] "Latitude"
#> [28] "Longitude"
# Preview what the column names look like in snake_case.
# janitor is already installed and attached at the top of the script, so it is
# not reinstalled here (reinstalling an attached package can fail).
# The result is only printed, not assigned: later steps still refer to the
# original column names (e.g. `video.views`).
data_to_clean %>%
  rename_with(tolower) %>%
  clean_names() %>%
  names()
#> [1] "rank" "youtuber"
#> [3] "subscribers" "video_views"
#> [5] "category" "title"
#> [7] "uploads" "country"
#> [9] "abbreviation" "channel_type"
#> [11] "video_views_rank" "country_rank"
#> [13] "channel_type_rank" "video_views_for_the_last_30_days"
#> [15] "lowest_monthly_earnings" "highest_monthly_earnings"
#> [17] "lowest_yearly_earnings" "highest_yearly_earnings"
#> [19] "subscribers_for_last_30_days" "created_year"
#> [21] "created_month" "created_date"
#> [23] "gross_tertiary_education_enrollment" "population"
#> [25] "unemployment_rate" "urban_population"
#> [27] "latitude" "longitude"
# Data integrity checks ----
# Number of rows that are exact duplicates of an earlier row
duplicated_rows <- data_to_clean %>%
  duplicated() %>%
  sum()
duplicated_rows
#> [1] 0
# Count of NA entries in each column of the raw data
na_values <- data_to_clean %>%
  is.na() %>%
  colSums()
na_values
#> rank Youtuber
#> 0 0
#> subscribers video.views
#> 0 0
#> category Title
#> 0 0
#> uploads Country
#> 0 0
#> Abbreviation channel_type
#> 0 0
#> video_views_rank country_rank
#> 1 116
#> channel_type_rank video_views_for_the_last_30_days
#> 33 56
#> lowest_monthly_earnings highest_monthly_earnings
#> 0 0
#> lowest_yearly_earnings highest_yearly_earnings
#> 0 0
#> subscribers_for_last_30_days created_year
#> 337 5
#> created_month created_date
#> 0 5
#> Gross.tertiary.education.enrollment.... Population
#> 123 123
#> Unemployment.rate Urban_population
#> 123 123
#> Latitude Longitude
#> 123 123
# Cleaning data ----
# Rows with at least one missing value, kept only for inspection.
na_rows <- data_to_clean[!complete.cases(data_to_clean), ]
head(na_rows)

# Keep only rows where Latitude, created_year and channel_type_rank are all
# present. The columns are passed to complete.cases() as a data frame so each
# one is tested for NA on its own. Combining them with `&` first would coerce
# the values to logical, and because `0 & NA` evaluates to FALSE (not NA) a row
# with a missing value could slip through.
required_cols <- c("Latitude", "created_year", "channel_type_rank")
data <- data_to_clean[complete.cases(data_to_clean[, required_cols]), ]

# Remaining NA count per column after the filter
na_rows_full <- colSums(is.na(data))
na_rows_full
#> rank Youtuber
#> 0 0
#> subscribers video.views
#> 0 0
#> category Title
#> 0 0
#> uploads Country
#> 0 0
#> Abbreviation channel_type
#> 0 0
#> video_views_rank country_rank
#> 0 0
#> channel_type_rank video_views_for_the_last_30_days
#> 0 15
#> lowest_monthly_earnings highest_monthly_earnings
#> 0 0
#> lowest_yearly_earnings highest_yearly_earnings
#> 0 0
#> subscribers_for_last_30_days created_year
#> 268 0
#> created_month created_date
#> 0 0
#> Gross.tertiary.education.enrollment.... Population
#> 0 0
#> Unemployment.rate Urban_population
#> 0 0
#> Latitude Longitude
#> 0 0
# Impute missing 30-day figures from each channel's lifetime monthly average.
# Reference date against which channel age is measured.
set_date <- "2023-08-05"
# lubridate is already attached at the top of the script; it is not
# reinstalled here because reinstalling an attached package can fail.
library(lubridate)

# Build the channel creation date from its year / month / day columns.
data$date <- with(
  data,
  ymd(sprintf("%04d%03s%02d", created_year, created_month, created_date))
)

# Whole months between creation and the reference date. Clamp to at least one
# month so a channel younger than a month does not cause a division by zero.
months_active <- interval(data$date, ymd(set_date)) %/% months(1)
months_active <- pmax(months_active, 1)

# Fill each missing value with the estimate for that same row. Assigning the
# full-length vector into the NA subset would instead use the first k
# estimates (and warn about mismatched lengths), pairing rows with the wrong
# channels.
# NOTE: as.integer() yields NA for results above .Machine$integer.max.
missing_subs <- is.na(data$subscribers_for_last_30_days)
data$subscribers_for_last_30_days[missing_subs] <- as.integer(
  round(data$subscribers[missing_subs] / months_active[missing_subs])
)

replacement_values <- as.integer(round(data$video.views / months_active))
missing_views <- is.na(data$video_views_for_the_last_30_days)
data$video_views_for_the_last_30_days[missing_views] <-
  replacement_values[missing_views]

# Re-check the NA count per column after imputation
na_rows_full <- colSums(is.na(data))
na_rows_full
#> rank Youtuber
#> 0 0
#> subscribers video.views
#> 0 0
#> category Title
#> 0 0
#> uploads Country
#> 0 0
#> Abbreviation channel_type
#> 0 0
#> video_views_rank country_rank
#> 0 0
#> channel_type_rank video_views_for_the_last_30_days
#> 0 0
#> lowest_monthly_earnings highest_monthly_earnings
#> 0 0
#> lowest_yearly_earnings highest_yearly_earnings
#> 0 0
#> subscribers_for_last_30_days created_year
#> 0 0
#> created_month created_date
#> 0 0
#> Gross.tertiary.education.enrollment.... Population
#> 0 0
#> Unemployment.rate Urban_population
#> 0 0
#> Latitude Longitude
#> 0 0
#> date
#> 0
# Data visualization ----
library(ggplot2)
library(plotly)

# Check the distribution of the 'video.views' and 'subscribers' columns.
# (The original parenthesised translation of this heading started with an
# unclosed `(` and broke parsing, so it is folded into this comment.)

# Histogram of total video views per channel
ggplot(data, aes(x = video.views)) +
  geom_histogram(fill = "skyblue", color = "white", bins = 30) +
  labs(title = "Video Views Distribution", x = "Video Views", y = "Count")

# Histogram of subscriber counts per channel
ggplot(data, aes(x = subscribers)) +
  geom_histogram(fill = "lightgreen", color = "white", bins = 30) +
  labs(title = "Subscribers Distribution", x = "Subscribers", y = "Count")
# Comparison of the averages of 'video.views' and 'subscribers' by 'channel_type'
# TODO: the code for this comparison does not appear in this part of the file.
'Data visualization > 데이터시각화(R)' 카테고리의 다른 글
Data analysis of YouTube channels dataset in R (0) | 2023.12.02 |
---|---|
한국의 인기 YouTube 동영상 통계 (1) | 2023.11.23 |
데이터시각화(R)_Texas flight data (0) | 2023.11.11 |
데이터시각화(R)_탐색적 자료분석 EDA (0) | 2023.11.01 |
데이터시각화(R)_Tidy Data # (0) | 2023.10.29 |