OBJECTIVE:
The purpose of this analysis is to examine the pattern and spread of COVID-19 from January 2020 onwards. Several R packages are used for data manipulation, date handling, tables, maps and interactive plots.
#install.packages("kableExtra")
suppressMessages(library(magrittr)) # pipe operations
suppressMessages(library(lubridate)) # date operations
suppressMessages(library(tidyverse)) # ggplot2, tidyr, dplyr...
suppressMessages(library(gridExtra)) # multiple grid-based plots on a page
suppressMessages(library(ggforce)) # accelerating ggplot2
suppressMessages(library(kableExtra)) # complex tables
suppressMessages(library(leaflet)) #for map
suppressMessages(library(plotly)) #plotly
Data Ingestion:
The data is read from the COVID-19 repository published by CSSEGISandData on GitHub, which is updated every day. It contains time series of confirmed cases, deaths and recoveries for the whole world.
## Global time series, one wide table per measure (one column per day).
## Each call downloads the current version of the file from GitHub.
confirmed <- read.csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)
death <- read.csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
)
recovered <- read.csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)
The data is verified and converted into the desired format; data cleaning, manipulation and visualisation are then performed. On the map, a larger circle radius denotes a location with a higher number of people affected by COVID-19.
#confirmed[1:10, 1:10]
## total number of columns; the per-day count columns run from 5 to `col`
col <- ncol(confirmed)
## get dates from column names
## read.csv() turns the "m/d/yy" headers into syntactic names such as
## "X1.22.20". Strip only that leading "X": a fixed substr(2, 8) keeps just 7
## characters, so a header like "X10.22.20" would be cut to "10.22.2" and no
## longer parse to the right date.
dates <- names(confirmed)[5:col] %>% sub("^X", "", .) %>% mdy()
#range(dates)
## [1] "2020-01-22" "2020-03-22"
## first and last reporting dates, as Date and as display strings
min_date <- min(dates)
max_date <- max(dates)
min_date_formt <- min_date %>% format('%d %b %Y')
max_date_formt <- max_date %>% format('%d %b %Y')
## Reshape one wide time-series table into long per-country daily totals.
##
## `data` is a data frame as returned by read.csv() for one of the global
## time-series files: columns Province.State, Country.Region, Lat, Long,
## followed by one count column per day whose name encodes the date
## (e.g. "X1.22.20").
##
## Returns a plain data.frame with columns `country`, `date` (Date) and
## `count` (sum over all provinces/states of that country on that date).
cleanData <- function(data) {
  ## remove some columns
  data %<>%
    select(-c(Province.State, Lat, Long)) %>%
    rename(country = Country.Region)
  ## convert from wide to long format
  data %<>% gather(key = date, value = count, -country)
  ## convert from character to date
  ## Strip only the leading "X" added by read.csv(); a fixed substr(2, 8)
  ## truncates headers with a two-digit month and day ("X10.22.20" ->
  ## "10.22.2"), which then parse to the wrong date.
  data %<>% mutate(date = mdy(sub("^X", "", date)))
  ## aggregate by country; as.data.frame() also drops the leftover grouping
  data %<>%
    group_by(country, date) %>%
    summarise(count = sum(count, na.rm = TRUE)) %>%
    as.data.frame()
  data
}
## long per-country tables, with the generic `count` column renamed to the
## measure each table holds
data_confirmed <- rename(cleanData(confirmed), confirmed = count)
data_deaths <- rename(cleanData(death), deaths = count)
data_recovered <- rename(cleanData(recovered), recovered = count)
## combine the three tables; merge() joins on the columns they share
## (country and date)
data <- merge(merge(data_confirmed, data_deaths), data_recovered)
## distinct countries/regions present in the merged data, excl. cruise ships
countries <- setdiff(pull(data, country), 'Cruise Ship')
We can list the countries with the highest total number of confirmed coronavirus cases up to the latest reporting date. The analysis shows that the US has become the epicentre of COVID-19.
## Top 10 countries by confirmed cases on the most recent date in the data.
## The date filter is applied *before* ranking: taking the 50 largest rows
## across all dates first and then filtering on a hard-coded day only keeps
## whichever of those 50 rows happen to fall on that day, and returns nothing
## once the daily-updated data has moved past it.
data_confirmed %>%
  filter(date == max(date)) %>%
  arrange(desc(confirmed)) %>%
  select(country, confirmed) %>%
  head(n = 10)
## Output recorded from an earlier run of this report (data for 2020-05-17):
## # A tibble: 3 x 2
## # Groups: country [3]
## country confirmed
## <fct> <int>
## 1 US 1486757
## 2 Russia 281752
## 3 United Kingdom 244995
This plot presents the confirmed, death and recovered counts for India. While the number of cases is rising exponentially, the number of recoveries has also increased.
## rows of the merged data that belong to India
Ind <- filter(data, country == "India")
## one bar chart per measure, each plotted against date
p <- ggplot(Ind, aes(x = date, y = confirmed)) +
  geom_bar(stat = "identity", fill = "#0000FF") +
  ggtitle("India COVID-19 ANALYSIS")
p1 <- ggplot(Ind, aes(x = date, y = deaths)) +
  geom_bar(stat = "identity", fill = "#CC0000")
p2 <- ggplot(Ind, aes(x = date, y = recovered)) +
  geom_bar(stat = "identity", fill = "#00FF00")
## stack the three charts in a single three-row figure
subplot(p, p1, p2, nrows = 3, margin = 0.1, titleY = TRUE)
This plot presents the confirmed, death and recovered counts for Ireland. We can see that the number of deaths here is rising sharply.
## rows of the merged data that belong to Ireland
Ire <- filter(data, country == "Ireland")
## one bar chart per measure, each plotted against date
p3 <- ggplot(Ire, aes(x = date, y = confirmed)) +
  geom_bar(stat = "identity", fill = "#0000FF") +
  ggtitle("Ireland COVID-19 ANALYSIS")
p4 <- ggplot(Ire, aes(x = date, y = deaths)) +
  geom_bar(stat = "identity", fill = "#CC0000")
p5 <- ggplot(Ire, aes(x = date, y = recovered)) +
  geom_bar(stat = "identity", fill = "#00FF00")
## stack the three charts in a single three-row figure
subplot(p3, p4, p5, nrows = 3, margin = 0.1, titleY = TRUE)
Visualising the data on a map
## per-date totals over all countries, labelled with the pseudo-country 'World'
data_world <- summarise(
  group_by(data, date),
  country = "World",
  confirmed = sum(confirmed),
  deaths = sum(deaths),
  recovered = sum(recovered)
)
## append the world rows to the per-country rows
data <- rbind(data, data_world)
## current confirmed cases: confirmed minus deaths minus recovered
data <- mutate(data, current_confirmed = confirmed - deaths - recovered)
## select last column, which is the number of latest confirmed cases
x <- confirmed
x$confirmed <- x[, ncol(x)]
x %<>%
  select(c(Country.Region, Province.State, Lat, Long, confirmed)) %>%
  ## log2(0) is -Inf, which would give a non-finite marker radius; locations
  ## without any confirmed case have nothing to show, so leave them out
  filter(confirmed > 0) %>%
  ## popup text: "<country> - <province>: <count>"
  mutate(txt = paste0(Country.Region, ' - ', Province.State, ': ', confirmed))
## base map with the default tile layer
m <- leaflet(width = 1200, height = 800) %>% addTiles()
# circle marker (units in pixels)
## radius grows with log2 of the count so large outbreaks do not swamp the map
m %<>% addCircleMarkers(x$Long, x$Lat,
                        radius = 2 + log2(x$confirmed), stroke = FALSE,
                        color = 'red', fillOpacity = 0.3,
                        popup = x$txt)
# world
m