CDC Data Exercise

Author

Elizabeth Hall

<<<<<<< Updated upstream

About the Data

This dataset is the ‘Monthly Provisional Counts of Deaths by Select Causes, 2020-2023’ though for the purpose of this exercise I am only using data from 2020-2022.

The dataset can be found here:

https://data.cdc.gov/NCHS/Monthly-Provisional-Counts-of-Deaths-by-Select-Cau/9dzk-mvmi/about_data

After cleaning the dataset contains the following list of variables:

  • Jurisdiction of Occurrence
  • Year
  • Month
  • Number Of Days
  • All Cause
  • Natural Cause
  • Septicemia
  • Malignant Neoplasms
  • Diabetes Mellitus
  • Alzheimer Disease
  • Influenza and Pneumonia
  • Chronic Lower Respiratory Diseases
  • Other Diseases of Respiratory System
  • Nephritis/Nephrotic Syndrome and Nephrosis
  • Abnormal Findings (No Classifiable Diagnosis)
  • Diseases of Heart
  • Cerebrovascular Diseases
  • Accidents/Unintentional Injuries
  • Motor Vehicle Accidents
  • Intentional Self Harm/Suicide
  • Assault/Homicide
  • Drug Overdose
  • COVID 19/Multiple Cause of Death
  • COVID 19/Underlying Cause of Death

Cleaning the Dataset

Load required package and load dataset.

# load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
library(knitr)
library(kableExtra)
library(here)

# Specify the file path relative to the working directory
file_path <- here("cdcdata-exercise/causeofdeathdata.csv")

# Load the CSV file into a data frame
cause_of_death_data_clean <- read.csv(file_path, stringsAsFactors = FALSE)

Creating a new variable, to prepare for the removal of redundant variables in the next step.

# Creating Number.Of.Days variable so that Start.Date and End.Date can be removed
# Month and Year are already variables, so Start.Date and End.Date become somewhat redundant
cause_of_death_data_clean$Number.Of.Days <- as.numeric(
  as.Date(cause_of_death_data_clean$End.Date, format = "%m/%d/%Y") - 
    as.Date(cause_of_death_data_clean$Start.Date, format = "%m/%d/%Y")
)
cause_of_death_data_clean <- cause_of_death_data_clean %>%
  dplyr::select(Jurisdiction.of.Occurrence, Year, Month, Number.Of.Days, everything())

Removing variables which contain junk text, and also getting rid of rows which contain no data. I also chose to filter out all data from 2023, since it was incomplete.

# Removing variables which display only 'Data not shown (6 month lag)'
# Removing Start.Date, End.Date, and Data.As.Of variables
cause_of_death_data_clean <- subset(cause_of_death_data_clean, select = -c(flag_accid, flag_mva, flag_suic, flag_homic, flag_drugod,Start.Date,End.Date,Data.As.Of))

# Removing rows with any NA values
cause_of_death_data_clean <- cause_of_death_data_clean[complete.cases(cause_of_death_data_clean), ]

# Filtering out data from the year 2023 because it is incomplete
cause_of_death_data_clean <- cause_of_death_data_clean %>%
  filter(Year != 2023)

Because of how variable names were formatted within the dataset, I added some code to make them more readable. I also altered the name of one variable which was very long and not practical for display purposes.

# Cleaning up variable names

clean_variable_names <- function(name) {
  name <- gsub("\\.+", " ", gsub("\\.\\.", "/", name))
  name <- gsub("Symptoms/Signs and Abnormal Clinical and Laboratory Findings/Not Elsewhere Classified", "Abnormal Findings (No Classifiable Diagnosis)", name)
  return(name)
}

cause_of_death_data_clean <- cause_of_death_data_clean %>%
  rename_with(clean_variable_names, everything())

Visualizing the Data

Calculating percentages for the total count for each cause of death.

# Calculate percentages total
cause_counts_total <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, Year, Month, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`) %>%
  group_by(`Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  mutate(percentage = total_count / sum(total_count) * 100) %>%
  arrange(desc(total_count))

Creating a pie chart for the overall total for each cause of death.

# Create pie chart 
pie_chart_total <- ggplot(cause_counts_total, aes(x = "", y = total_count, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  coord_polar("y", start = 0) +
  labs(title = "Distribution of Causes of Death",
       fill = "Cause of Death",
       x = NULL, y = NULL,
       caption = "Data source: CDC") +
  theme_void() +
  theme(legend.position = "right",
        legend.text = element_text(size = 8),  
        legend.title = element_text(size = 10),  
        legend.key.size = unit(0.5, "lines"), 
        plot.title = element_text(size = 16),  
        plot.margin = margin(2, 6, 2, 2, "cm"),
        legend.box.margin = margin(0, -10, 0, 0)) +  
  guides(fill = guide_legend(
    keywidth = unit(0.5, "lines"),  
    label.position = "right",       
    label.hjust = 0                 
  )) +
  scale_fill_discrete(labels = paste0(cause_counts_total$`Cause of Death`, " (", round(cause_counts_total$percentage), "%)"))

# Adjustments
pie_chart_total <- pie_chart_total + theme(
  plot.margin = margin(2, 2, 2, 2, "cm"),
  plot.title = element_text(size = 16, hjust = 0.5, margin = margin(0, 0, 10, 0)),
  plot.caption = element_text(size = 10, hjust = 0.5, margin = margin(10, 0, 0, 0))
)

# Show pie chart
print(pie_chart_total)


Grouping by month and cause of death, calculating total deaths per month, and calculating the percentage of total deaths each month. This is for graphing purposes.

# Group by month and cause of death
cause_counts_month <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, Year, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Month) %>%
  group_by(Month, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  mutate(Month = factor(month.name[Month], levels = month.name)) %>%
  arrange(Month, desc(total_count))

# Calculate total deaths for each month
total_deaths_month <- cause_counts_month %>%
  group_by(Month) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each month
total_deaths_month <- total_deaths_month %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot stacked bar plot for causes of death per month.

# Create a stacked bar plot for causes of death by month
ggplot(cause_counts_month, aes(x = Month, y = total_count/1e6, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) + 
  labs(title = "Total Causes of Death by Month",
       x = "Month",
       y = "Total Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))


Plot bar graph for total number of deaths per month.

# Plot the bar graph for total number of deaths per month.
ggplot(total_deaths_month, aes(x = Month, y = total_deaths/1e6, fill = Month)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) +
  labs(title = "Total Deaths by Month",
       x = "Month",
       y = "Total Deaths") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")


Print table for total deaths per month.

# Print table of total deaths per month.
kable(total_deaths_month, 
      col.names = c("Month", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Total Deaths and Percentage by Month") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Total Deaths and Percentage by Month
Month Total Deaths Percentage
January 1973713 10.49
February 1531031 8.14
March 1452402 7.72
April 1537652 8.17
May 1442009 7.66
June 1324227 7.04
July 1440256 7.65
August 1560359 8.29
September 1527357 8.12
October 1537261 8.17
November 1595771 8.48
December 1896740 10.08

Grouping by year and cause of death, calculating total deaths per year, and calculating the percentage of total deaths each year. This is for graphing purposes.

# Group by year and cause of death
cause_counts_year <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, Month, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Year) %>%
  group_by(Year, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  arrange(Year, desc(total_count))

# Calculate total deaths for each year
total_deaths_year <- cause_counts_year %>%
  group_by(Year) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each year
total_deaths_year <- total_deaths_year %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot bar graph for total number of death per year.

# Plot bar graph for total number of deaths per year.
ggplot(total_deaths_year, aes(x = Year, y = total_deaths/1e6, fill = as.factor(Year))) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) +
  labs(title = "Total Deaths by Year",
       x = "Year",
       y = "Total Deaths") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")


Print table for total deaths per year.

# Print table for total deaths per year.
kable(total_deaths_year, 
      col.names = c("Year", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Total Deaths and Percentage by Year") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Total Deaths and Percentage by Year
Year Total Deaths Percentage
2020 6326855 33.62
2021 6544637 34.78
2022 5947286 31.60

Grouping by month, year, and cause of death, calculating total deaths per year per month, and calculating the percentage of total deaths each year and month. This is for graphing purposes.

# Group by year, month, and cause of death
cause_counts_year_month <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Year, -Month) %>%
  group_by(Year, Month, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  arrange(Year, Month, desc(total_count))

# Calculate total deaths for each year and month
total_deaths_year_month <- cause_counts_year_month %>%
  group_by(Year, Month) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each year and month
total_deaths_year_month <- total_deaths_year_month %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot stacked bar plot for causes of death per month per year.

# Create a stacked bar plot for causes of death by month and year
ggplot(cause_counts_year_month, aes(x = Month, y = total_count/1e6, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Year) +  # facet by year
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) + # format y-axis labels
  labs(title = "Total Causes of Death by Month & Year",
       x = "Month",
       y = "Total Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))


Print table for total deaths per month per year.

# Print table for total deaths per month per year.
kable(total_deaths_year_month, 
      col.names = c("Year", "Month", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Total Deaths and Percentage by Year and Month") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Total Deaths and Percentage by Year and Month
Year Month Total Deaths Percentage
2020 1 458159 7.24
2020 2 424927 6.72
2020 3 476820 7.54
2020 4 640048 10.12
2020 5 532282 8.41
2020 6 453854 7.17
2020 7 519117 8.20
2020 8 513981 8.12
2020 9 465822 7.36
2020 10 501311 7.92
2020 11 585911 9.26
2020 12 754623 11.93
2021 1 774913 11.84
2021 2 545362 8.33
2021 3 495046 7.56
2021 4 466178 7.12
2021 5 462739 7.07
2021 6 431520 6.59
2021 7 457288 6.99
2021 8 584022 8.92
2021 9 616758 9.42
2021 10 568395 8.68
2021 11 535910 8.19
2021 12 606506 9.27
2022 1 740641 12.45
2022 2 560742 9.43
2022 3 480536 8.08
2022 4 431426 7.25
2022 5 446988 7.52
2022 6 438853 7.38
2022 7 463851 7.80
2022 8 462356 7.77
2022 9 444777 7.48
2022 10 467555 7.86
2022 11 473950 7.97
2022 12 535611 9.01

This section is contributed by Chaohua Li

Create synthetic data

We create a new dataset by scrambling the data from the original dataset. That means the values in each variable are sampled from the old values without replacement. Since the year, month, and days are considered the id for each observation, these variables won’t be scrambled.

#Create data set left that contains jurisdiction, year, month and days 
left<-cause_of_death_data_clean[,c(1:4)]
#Create data set right that contains numbers of deaths for different causes
right<-cause_of_death_data_clean[,-c(1:4)]
#set seed for reproducible results
set.seed(456)
#define a new data frame synth that will contain scrambled values
synth <- right
#use a loop to scramble values without replacement in the dataset right 
for (col in colnames(right)) {
  #sample values without replacement for each variable
  synth[[col]] <- sample(right[[col]], replace = FALSE)
}
#combine dataset left with the scrambled data right
synth2 <- cbind(left, synth)

Summarizes and explores the synthetic data

Calculating percentages for the total count for each cause of death.

# Calculate percentages total
cause_counts_total <- synth2 %>%
  select(-c(`All Cause`, Year, Month, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`) %>%
  group_by(`Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  mutate(percentage = total_count / sum(total_count) * 100) %>%
  arrange(desc(total_count))

Recreating a pie chart for the overall total for each cause of death.

# Create pie chart 
pie_chart_total <- ggplot(cause_counts_total, aes(x = "", y = total_count, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  coord_polar("y", start = 0) +
  labs(title = "Synthetic Data: Distribution of \nCauses of Death",
       fill = "Cause of Death",
       x = NULL, y = NULL,) +
  theme_void() +
  theme(legend.position = "right",
        legend.text = element_text(size = 8),  
        legend.title = element_text(size = 10),  
        legend.key.size = unit(0.5, "lines"), 
        plot.title = element_text(size = 16),  
        plot.margin = margin(2, 6, 2, 2, "cm"),
        legend.box.margin = margin(0, -10, 0, 0)) +  
  guides(fill = guide_legend(
    keywidth = unit(0.5, "lines"),  
    label.position = "right",       
    label.hjust = 0                 
  )) +
  scale_fill_discrete(labels = paste0(cause_counts_total$`Cause of Death`, " (", round(cause_counts_total$percentage), "%)"))

# Adjustments
pie_chart_total <- pie_chart_total + theme(
  plot.margin = margin(2, 2, 2, 2, "cm"),
  plot.title = element_text(size = 16, hjust = 0.5, margin = margin(0, 0, 10, 0)),
  plot.caption = element_text(size = 10, hjust = 0.5, margin = margin(10, 0, 0, 0))
)

# Show pie chart
print(pie_chart_total)

Because each cause of death in the synthetic dataset contains exactly the same group of values, so the totals for each cause are the same with the original dataset.


Grouping by month and cause of death, calculating total deaths per month, and calculating the percentage of total deaths each month. This is for graphing purposes.

# Group by month and cause of death
cause_counts_month <- synth2 %>%
  select(-c(`All Cause`, Year, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Month) %>%
  group_by(Month, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  mutate(Month = factor(month.name[Month], levels = month.name)) %>%
  arrange(Month, desc(total_count))

# Calculate total deaths for each month
total_deaths_month <- cause_counts_month %>%
  group_by(Month) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each month
total_deaths_month <- total_deaths_month %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot stacked bar plot for causes of death per month.

# Create a stacked bar plot for causes of death by month
ggplot(cause_counts_month, aes(x = Month, y = total_count/1e6, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) + 
  labs(title = "Synthetic Data: Total Causes of \nDeath by Month",
       x = "Month",
       y = "Total Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))

This bar plot looks different from the one using original data, because the random sampling broke the association between month and deaths due to different causes.

Plot bar graph for total number of deaths per month.

# Plot the bar graph for total number of deaths per month.
ggplot(total_deaths_month, aes(x = Month, y = total_deaths/1e6, fill = Month)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) +
  labs(title = "Synthetic Data:Total Deaths by Month",
       x = "Month",
       y = "Total Deaths") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")


Print table for total deaths per month.

# Print table of total deaths per month.
kable(total_deaths_month, 
      col.names = c("Month", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Synthetic Data:Total Deaths and Percentage by Month") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Synthetic Data:Total Deaths and Percentage by Month
Month Total Deaths Percentage
January 1431480 7.61
February 1595078 8.48
March 1465555 7.79
April 1480724 7.87
May 1504505 7.99
June 1646561 8.75
July 1750683 9.30
August 1623654 8.63
September 1619599 8.61
October 1564062 8.31
November 1572547 8.36
December 1564330 8.31

Differences in the distribution of deaths by month between original and synthetic data are also reflected in this table.


Grouping by year and cause of death, calculating total deaths per year, and calculating the percentage of total deaths each year. This is for graphing purposes.

# Group by year and cause of death
cause_counts_year <- synth2 %>%
  select(-c(`All Cause`, Month, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Year) %>%
  group_by(Year, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  arrange(Year, desc(total_count))

# Calculate total deaths for each year
total_deaths_year <- cause_counts_year %>%
  group_by(Year) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each year
total_deaths_year <- total_deaths_year %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot bar graph for total number of death per year.

# Plot bar graph for total number of deaths per year.
ggplot(total_deaths_year, aes(x = Year, y = total_deaths/1e6, fill = as.factor(Year))) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) +
  labs(title = "Synthetic Data:Total Deaths by Year",
       x = "Year",
       y = "Total Deaths") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

The distribution of deaths by year using synthetic data is very similar to that in the original data. But the table below does show the numbers are slightly different from the original results.


Print table for total deaths per year.

# Print table for total deaths per year.
kable(total_deaths_year, 
      col.names = c("Year", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Synthetic Data:Total Deaths and Percentage by Year") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Synthetic Data:Total Deaths and Percentage by Year
Year Total Deaths Percentage
2020 6193867 32.91
2021 6531616 34.71
2022 6093295 32.38

Grouping by month, year, and cause of death, calculating total deaths per year per month, and calculating the percentage of total deaths each year and month. This is for graphing purposes.

# Group by year, month, and cause of death
cause_counts_year_month <- synth2 %>%
  select(-c(`All Cause`, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Year, -Month) %>%
  group_by(Year, Month, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  arrange(Year, Month, desc(total_count))

# Calculate total deaths for each year and month
total_deaths_year_month <- cause_counts_year_month %>%
  group_by(Year, Month) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each year and month
total_deaths_year_month <- total_deaths_year_month %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot stacked bar plot for causes of death per month per year.

# Create a stacked bar plot for causes of death by month and year
ggplot(cause_counts_year_month, aes(x = Month, y = total_count/1e6, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Year) +  # facet by year
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) + # format y-axis labels
  labs(title = "Synthetic Data:Total Causes of Death \nby Month & Year",
       x = "Month",
       y = "Total Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))

The distribution of deaths by year and month using synthetic data looks different from the original results. This is due to the random sampling process which broke the original pattern.


Print table for total deaths per month per year.

# Print table for total deaths per month per year.
kable(total_deaths_year_month, 
      col.names = c("Year", "Month", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Synthetic Data:Total Deaths and Percentage \nby Year and Month") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "600px")
Synthetic Data:Total Deaths and Percentage by Year and Month
Year Month Total Deaths Percentage
2020 1 472058 7.62
2020 2 497446 8.03
2020 3 524118 8.46
2020 4 500293 8.08
2020 5 518146 8.37
2020 6 561612 9.07
2020 7 492192 7.95
2020 8 477343 7.71
2020 9 589829 9.52
2020 10 526264 8.50
2020 11 528635 8.53
2020 12 505931 8.17
2021 1 488815 7.48
2021 2 606530 9.29
2021 3 461131 7.06
2021 4 478082 7.32
2021 5 537713 8.23
2021 6 561448 8.60
2021 7 703027 10.76
2021 8 524714 8.03
2021 9 546985 8.37
2021 10 517421 7.92
2021 11 532602 8.15
2021 12 573148 8.77
2022 1 470607 7.72
2022 2 491102 8.06
2022 3 480306 7.88
2022 4 502349 8.24
2022 5 448646 7.36
2022 6 523501 8.59
2022 7 555464 9.12
2022 8 621597 10.20
2022 9 482785 7.92
2022 10 520377 8.54
2022 11 511310 8.39
2022 12 485251 7.96

=======

About the Data

This dataset is the ‘Monthly Provisional Counts of Deaths by Select Causes, 2020-2023’ though for the purpose of this exercise I am only using data from 2020-2022.

The dataset can be found here:

https://data.cdc.gov/NCHS/Monthly-Provisional-Counts-of-Deaths-by-Select-Cau/9dzk-mvmi/about_data

After cleaning the dataset contains the following list of variables:

  • Jurisdiction of Occurrence
  • Year
  • Month
  • Number Of Days
  • All Cause
  • Natural Cause
  • Septicemia
  • Malignant Neoplasms
  • Diabetes Mellitus
  • Alzheimer Disease
  • Influenza and Pneumonia
  • Chronic Lower Respiratory Diseases
  • Other Diseases of Respiratory System
  • Nephritis/Nephrotic Syndrome and Nephrosis
  • Abnormal Findings (No Classifiable Diagnosis)
  • Diseases of Heart
  • Cerebrovascular Diseases
  • Accidents/Unintentional Injuries
  • Motor Vehicle Accidents
  • Intentional Self Harm/Suicide
  • Assault/Homicide
  • Drug Overdose
  • COVID 19/Multiple Cause of Death
  • COVID 19/Underlying Cause of Death

Cleaning the Dataset

Load required package and load dataset.

# load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
library(knitr)
library(kableExtra)
library(here)

# Specify the file path relative to the working directory
file_path <- "cdcdata-exercise/causeofdeathdata.csv"

# Load the CSV file into a data frame
cause_of_death_data_clean <- read.csv(here("cdcdata-exercise", "causeofdeathdata.csv"), stringsAsFactors = FALSE)

Creating a new variable, to prepare for the removal of redundant variables in the next step.

# Creating Number.Of.Days variable so that Start.Date and End.Date can be removed
# Month and Year are already variables, so Start.Date and End.Date become somewhat redundant
cause_of_death_data_clean$Number.Of.Days <- as.numeric(
  as.Date(cause_of_death_data_clean$End.Date, format = "%m/%d/%Y") - 
    as.Date(cause_of_death_data_clean$Start.Date, format = "%m/%d/%Y")
)
cause_of_death_data_clean <- cause_of_death_data_clean %>%
  dplyr::select(Jurisdiction.of.Occurrence, Year, Month, Number.Of.Days, everything())

Removing variables which contain junk text, and also getting rid of rows which contain no data. I also chose to filter out all data from 2023, since it was incomplete.

# Removing variables which display only 'Data not shown (6 month lag)'
# Removing Start.Date, End.Date, and Data.As.Of variables
cause_of_death_data_clean <- subset(cause_of_death_data_clean, select = -c(flag_accid, flag_mva, flag_suic, flag_homic, flag_drugod,Start.Date,End.Date,Data.As.Of))

# Removing rows with any NA values
cause_of_death_data_clean <- cause_of_death_data_clean[complete.cases(cause_of_death_data_clean), ]

# Filtering out data from the year 2023 because it is incomplete
cause_of_death_data_clean <- cause_of_death_data_clean %>%
  filter(Year != 2023)

Because of how variable names were formatted within the dataset, I added some code to make them more readable. I also altered the name of one variable which was very long and not practical for display purposes.

# Cleaning up variable names

clean_variable_names <- function(name) {
  name <- gsub("\\.+", " ", gsub("\\.\\.", "/", name))
  name <- gsub("Symptoms/Signs and Abnormal Clinical and Laboratory Findings/Not Elsewhere Classified", "Abnormal Findings (No Classifiable Diagnosis)", name)
  return(name)
}

cause_of_death_data_clean <- cause_of_death_data_clean %>%
  rename_with(clean_variable_names, everything())

Visualizing the Data

Calculating percentages for the total count for each cause of death.

# Calculate percentages total
cause_counts_total <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, Year, Month, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`) %>%
  group_by(`Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  mutate(percentage = total_count / sum(total_count) * 100) %>%
  arrange(desc(total_count))

Creating a pie chart for the overall total for each cause of death.

# Create pie chart 
pie_chart_total <- ggplot(cause_counts_total, aes(x = "", y = total_count, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  coord_polar("y", start = 0) +
  labs(title = "Distribution of Causes of Death",
       fill = "Cause of Death",
       x = NULL, y = NULL,
       caption = "Data source: CDC") +
  theme_void() +
  theme(legend.position = "right",
        legend.text = element_text(size = 8),  
        legend.title = element_text(size = 10),  
        legend.key.size = unit(0.5, "lines"), 
        plot.title = element_text(size = 16),  
        plot.margin = margin(2, 6, 2, 2, "cm"),
        legend.box.margin = margin(0, -10, 0, 0)) +  
  guides(fill = guide_legend(
    keywidth = unit(0.5, "lines"),  
    label.position = "right",       
    label.hjust = 0                 
  )) +
  scale_fill_discrete(labels = paste0(cause_counts_total$`Cause of Death`, " (", round(cause_counts_total$percentage), "%)"))

# Adjustments
pie_chart_total <- pie_chart_total + theme(
  plot.margin = margin(2, 2, 2, 2, "cm"),
  plot.title = element_text(size = 16, hjust = 0.5, margin = margin(0, 0, 10, 0)),
  plot.caption = element_text(size = 10, hjust = 0.5, margin = margin(10, 0, 0, 0))
)

# Show pie chart
print(pie_chart_total)


Grouping by month and cause of death, calculating total deaths per month, and calculating the percentage of total deaths each month. This is for graphing purposes.

# Group by month and cause of death
cause_counts_month <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, Year, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Month) %>%
  group_by(Month, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  mutate(Month = factor(month.name[Month], levels = month.name)) %>%
  arrange(Month, desc(total_count))

# Calculate total deaths for each month
total_deaths_month <- cause_counts_month %>%
  group_by(Month) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each month
total_deaths_month <- total_deaths_month %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot stacked bar plot for causes of death per month.

# Create a stacked bar plot for causes of death by month
ggplot(cause_counts_month, aes(x = Month, y = total_count/1e6, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) + 
  labs(title = "Total Causes of Death by Month",
       x = "Month",
       y = "Total Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))


Plot bar graph for total number of deaths per month.

# Plot the bar graph for total number of deaths per month.
ggplot(total_deaths_month, aes(x = Month, y = total_deaths/1e6, fill = Month)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) +
  labs(title = "Total Deaths by Month",
       x = "Month",
       y = "Total Deaths") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")


Print table for total deaths per month.

# Print table of total deaths per month.
kable(total_deaths_month, 
      col.names = c("Month", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Total Deaths and Percentage by Month") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Total Deaths and Percentage by Month
Month Total Deaths Percentage
January 1973713 10.49
February 1531031 8.14
March 1452402 7.72
April 1537652 8.17
May 1442009 7.66
June 1324227 7.04
July 1440256 7.65
August 1560359 8.29
September 1527357 8.12
October 1537261 8.17
November 1595771 8.48
December 1896740 10.08

Grouping by year and cause of death, calculating total deaths per year, and calculating the percentage of total deaths each year. This is for graphing purposes.

# Group by year and cause of death
cause_counts_year <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, Month, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Year) %>%
  group_by(Year, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  arrange(Year, desc(total_count))

# Calculate total deaths for each year
total_deaths_year <- cause_counts_year %>%
  group_by(Year) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each year
total_deaths_year <- total_deaths_year %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot bar graph for total number of death per year.

# Plot bar graph for total number of deaths per year.
ggplot(total_deaths_year, aes(x = Year, y = total_deaths/1e6, fill = as.factor(Year))) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) +
  labs(title = "Total Deaths by Year",
       x = "Year",
       y = "Total Deaths") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")


Print table for total deaths per year.

# Print table for total deaths per year.
kable(total_deaths_year, 
      col.names = c("Year", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Total Deaths and Percentage by Year") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Total Deaths and Percentage by Year
Year Total Deaths Percentage
2020 6326855 33.62
2021 6544637 34.78
2022 5947286 31.60

Grouping by month, year, and cause of death, calculating total deaths per year per month, and calculating the percentage of total deaths each year and month. This is for graphing purposes.

# Group by year, month, and cause of death
cause_counts_year_month <- cause_of_death_data_clean %>%
  select(-c(`All Cause`, `Number Of Days`)) %>%
  gather(key = "Cause of Death", value = "count", -`Jurisdiction of Occurrence`, -Year, -Month) %>%
  group_by(Year, Month, `Cause of Death`) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  arrange(Year, Month, desc(total_count))

# Calculate total deaths for each year and month
total_deaths_year_month <- cause_counts_year_month %>%
  group_by(Year, Month) %>%
  summarise(total_deaths = sum(total_count))

# Calculate percentage of total deaths for each year and month
total_deaths_year_month <- total_deaths_year_month %>%
  mutate(percentage = total_deaths / sum(total_deaths) * 100)

Plot stacked bar plot for causes of death per month per year.

# Create a stacked bar plot for causes of death by month and year
ggplot(cause_counts_year_month, aes(x = Month, y = total_count/1e6, fill = `Cause of Death`)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Year) +  # facet by year
  scale_y_continuous(labels = function(x) paste0(format(x, big.mark = ",", scientific = FALSE), " million"), 
                     breaks = pretty_breaks()) + # format y-axis labels
  labs(title = "Total Causes of Death by Month & Year",
       x = "Month",
       y = "Total Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))


Print table for total deaths per month per year.

# Print table for total deaths per month per year.
kable(total_deaths_year_month, 
      col.names = c("Year", "Month", "Total Deaths", "Percentage"),
      format = "html",
      digits = 2,
      caption = "Total Deaths and Percentage by Year and Month") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(height = "200px")
Total Deaths and Percentage by Year and Month
Year Month Total Deaths Percentage
2020 1 458159 7.24
2020 2 424927 6.72
2020 3 476820 7.54
2020 4 640048 10.12
2020 5 532282 8.41
2020 6 453854 7.17
2020 7 519117 8.20
2020 8 513981 8.12
2020 9 465822 7.36
2020 10 501311 7.92
2020 11 585911 9.26
2020 12 754623 11.93
2021 1 774913 11.84
2021 2 545362 8.33
2021 3 495046 7.56
2021 4 466178 7.12
2021 5 462739 7.07
2021 6 431520 6.59
2021 7 457288 6.99
2021 8 584022 8.92
2021 9 616758 9.42
2021 10 568395 8.68
2021 11 535910 8.19
2021 12 606506 9.27
2022 1 740641 12.45
2022 2 560742 9.43
2022 3 480536 8.08
2022 4 431426 7.25
2022 5 446988 7.52
2022 6 438853 7.38
2022 7 463851 7.80
2022 8 462356 7.77
2022 9 444777 7.48
2022 10 467555 7.86
2022 11 473950 7.97
2022 12 535611 9.01

Stashed changes