Hospital Analytics: US Best Hospitals

Suppose you are moving to a city in the US and want to live close to a hospital. Is there a better way to find a good hospital than just looking at user reviews on Google? What about data from CMS (Centers for Medicare & Medicaid Services)?

The data for this Exploratory Data Analysis (EDA) is published on Kaggle (link). In this EDA, we will find where the hospitals with high rating scores in the US are located.

As usual, let’s load libraries and data.

##### Library #####
# Packages attached for the analysis.
# NOTE(review): the data-processing step relies on the `zipcode` dataset and
# clean.zipcodes(), but no package providing them is attached here --
# presumably the `zipcode` package; confirm.
library(tidyverse)
library(ggmap)
library(gridExtra)
library(ggcounty)
library(stringr)

# Read the hospital table from a hard-coded absolute path; adjust it to
# wherever HospInfo.csv is stored.
# NOTE(review): the glimpse() output shown in this post lists <fctr> columns,
# i.e. strings were read as factors. read.csv() only does that by default on
# R < 4.0 -- confirm which R version is targeted.
data <- read.csv("C:/HospInfo.csv")

##### Library #####

# Packages attached for the analysis.
# NOTE(review): the data-processing step relies on the `zipcode` dataset and
# clean.zipcodes(), but no package providing them is attached here --
# presumably the `zipcode` package; confirm.
library(tidyverse)

library(ggmap)

library(gridExtra)

library(ggcounty)

library(stringr)

# Read the hospital table from a hard-coded absolute path; adjust it to
# wherever HospInfo.csv is stored.
data <- read.csv("C:/HospInfo.csv")

There are many ways to plot data on a map, such as with ggmap. Since the dataset doesn’t provide longitude and latitude coordinates, I will use the ZIP code through a package called “ggcounty” (link) and the FIPS county codes from the U.S. Census. Next, we will set up some functions to use throughout the EDA.

##### Arrange Function #####
# Convert `x` to a factor whose levels are sorted by how often each value
# occurs, most frequent first.
#
# x: a vector (or factor) of values to tabulate.
# Returns a factor with the same values as `x` and frequency-ordered levels.
reorder <- function(x) {
  counts <- table(x)
  by_frequency <- sort(counts, decreasing = TRUE)
  factor(x, levels = names(by_frequency))
}

##### Theme Moma #####
# Build a theme() specification with a light (#F7F6ED) fill for the plot,
# panel and legend, a dashed black panel border and dotted grey grid lines.
#
# base_size, base_family: accepted but not referenced anywhere in the body.
# Returns the object produced by theme().
theme_moma <- function(base_size = 12, base_family = "Helvetica") {
  light_fill <- element_rect(fill = "#F7F6ED")
  dashed_frame <- element_rect(colour = "black", fill = NA, linetype = "dashed")
  dotted_grid <- element_line(colour = "#7F7F7F", linetype = "dotted")

  theme(
    plot.background = light_fill,
    legend.key = light_fill,
    legend.background = light_fill,
    panel.background = light_fill,
    panel.border = dashed_frame,
    panel.grid.minor = dotted_grid,
    panel.grid.major = dotted_grid
  )
}

##### Arrange Function #####

# Convert `x` to a factor whose levels are sorted by how often each value
# occurs, most frequent first.
#
# x: a vector (or factor) of values to tabulate.
# Returns a factor with the same values as `x` and frequency-ordered levels.
reorder <- function(x) {
  counts <- table(x)
  by_frequency <- sort(counts, decreasing = TRUE)
  factor(x, levels = names(by_frequency))
}

##### Theme Moma #####

# Build a theme() specification with a light (#F7F6ED) fill for the plot,
# panel and legend, a dashed black panel border and dotted grey grid lines.
#
# base_size, base_family: accepted but not referenced anywhere in the body.
# Returns the object produced by theme().
theme_moma <- function(base_size = 12, base_family = "Helvetica") {
  light_fill <- element_rect(fill = "#F7F6ED")
  dashed_frame <- element_rect(colour = "black", fill = NA, linetype = "dashed")
  dotted_grid <- element_line(colour = "#7F7F7F", linetype = "dotted")

  theme(
    plot.background = light_fill,
    legend.key = light_fill,
    legend.background = light_fill,
    panel.background = light_fill,
    panel.border = dashed_frame,
    panel.grid.minor = dotted_grid,
    panel.grid.major = dotted_grid
  )
}

Next, let’s process the data.

##### Data Processing #####
# Convert the raw data frame to a tibble. as_tibble() replaces tbl_df(),
# which has been deprecated since dplyr 1.0.0.
data <- as_tibble(data)

# Initialize Zipcode
# Load the ZIP code lookup table. The package is named explicitly because the
# script never attaches `zipcode` with library().
data("zipcode", package = "zipcode")
zipcode <- as_tibble(zipcode)

# Clean Zip Code
# clean.zipcodes() corrects ZIP codes that lost their leading 0; it is
# namespace-qualified for the same reason as above.
data$ZIP.Code <- zipcode::clean.zipcodes(data$ZIP.Code)
# Rename so the column name matches the join key used with `zipcode`.
data <- rename(data, zip = ZIP.Code)

##### Data Processing #####

# Convert the raw data frame to a tibble. as_tibble() replaces tbl_df(),
# which has been deprecated since dplyr 1.0.0.
data <- as_tibble(data)

# Initialize Zipcode
# Load the ZIP code lookup table. The package is named explicitly because the
# script never attaches `zipcode` with library().
data("zipcode", package = "zipcode")

zipcode <- as_tibble(zipcode)

# Clean Zip Code
# clean.zipcodes() corrects ZIP codes that lost their leading 0; it is
# namespace-qualified for the same reason as above.
data$ZIP.Code <- zipcode::clean.zipcodes(data$ZIP.Code)

# Rename so the column name matches the join key used with `zipcode`.
data <- rename(data, zip = ZIP.Code)

As usual, let’s start with glimpse().

# Print a compact overview of `data`: the observation and variable counts,
# then each column's name, type and first few values.
glimpse(data)

glimpse(data)

Observations: 4,807
Variables: 29
$ Provider.ID   <int> 10001, 10005, 10006, 10007, 10008, 10011, 10012, 1001...
$ Hospital.Name <fctr> SOUTHEAST ALABAMA MEDICAL CENTER, MARSHALL MEDICAL C...
$ Address       <fctr> 1108 ROSS CLARK CIRCLE, 2505 U S HIGHWAY 431 NORTH, ...

Observations: 4,807

Variables: 29

$ Provider.ID <int> 10001, 10005, 10006, 10007, 10008, 10011, 10012, 1001...

$ Hospital.Name <fctr> SOUTHEAST ALABAMA MEDICAL CENTER, MARSHALL MEDICAL C...

$ Address <fctr> 1108 ROSS CLARK CIRCLE, 2505 U S HIGHWAY 431 NORTH, ...

There are 4,807 observations and 29 variables. Many of them are factors whose levels are not yet ordered. Also, missing values are recorded as the string “Not Available.” So, I will order the factor levels and change “Not Available” to NA.

# Change the "Not Available" placeholder to a true NA in every column
data[data == "Not Available"] <- NA

# Changing Order
# All seven "national comparison" columns share the same three levels, so the
# levels are defined once and applied to every column in a single pass rather
# than repeating the vector in seven separate ordered() calls.
comparison_levels <- c(
  "Above the National average",
  "Same as the National average",
  "Below the National average"
)

comparison_cols <- c(
  "Mortality.national.comparison",
  "Safety.of.care.national.comparison",
  "Readmission.national.comparison",
  "Patient.experience.national.comparison",
  "Effectiveness.of.care.national.comparison",
  "Timeliness.of.care.national.comparison",
  "Efficient.use.of.medical.imaging.national.comparison"
)

# Values that are not one of comparison_levels (including NA) become NA.
data[comparison_cols] <- lapply(
  data[comparison_cols], ordered, levels = comparison_levels
)

# Joining Lat Long
# Left-join the ZIP lookup table onto the hospital rows by "zip" (per the
# original note, this is what brings in the lat/lon columns), then drop the
# city, state and Location columns from the result.
eda <- data %>%
  left_join(zipcode, by = "zip") %>%
  select(-city, -state, -Location)

# Change the "Not Available" placeholder to a true NA in every column

data[data == "Not Available"] <- NA

#Changing Order

data$Mortality.national.comparison <- ordered(

data$Mortality.national.comparison, levels = c("Above the National average",

"Same as the National average",