The chart
Stata
R
Data
Sample dataset in .dta
format can be downloaded here. The R code uses the same dataset in .csv
format, which can be found here.
The code
Stata
*_______________________________________________________________________________
* Project : IDinsight data visualization guide - Stata graph templates
*
* Graph Type : Bar graphs with CIs, grouping outcomes over another variable
* Vertical and horizontal
* By : Crystal Huang
* Last edited : April 15, 2019
*_______________________________________________________________________________
* Toy dataset: folder holding the data and folder receiving the exported graphs
global data "/Users/crystalhuang/Dropbox (IDinsight)/Data Visualization Guide/3. Toy Data"
global output "/Users/crystalhuang/Dropbox (IDinsight)/Data Visualization Guide/2. Graph templates/graphs"
use "$data/toydata.dta", clear

* Apply the IDinsight scheme and font to every graph drawn below
set scheme idinsight
graph set window fontface "Arial"

* Graph 2 - Anemia, overweight and high blood pressure, by region
* Outcomes shown in the chart
local outcomes anemic overweight bp_high

* For each outcome, store the standard error and the CI bounds of the regional
* mean on every observation of that region
foreach y of local outcomes {
    generate se_`y' = .
    generate lo_`y' = .
    generate hi_`y' = .

    * One estimate per value of the grouping variable (region is coded 1 to 4)
    forvalues i = 1/4 {
        mean `y' if region == `i'
        matrix A = r(table)

        * Rows of r(table) used here: 2 = standard error, 5 = lower CI bound,
        * 6 = upper CI bound
        local se = A[2,1]
        local lo = A[5,1]
        local hi = A[6,1]

        replace se_`y' = `se' if region == `i'
        replace lo_`y' = `lo' if region == `i'
        replace hi_`y' = `hi' if region == `i'
    }
}

* One row per region: the outcome means give the bar heights, and the SE / CI
* values are constant within region so the first one is kept
collapse (mean) `outcomes' (first) se_* lo_* hi_*, by(region)
generate id = _n

* Give the means the same naming pattern as se_/lo_/hi_ so that all four
* statistics can be reshaped together
foreach y of local outcomes {
    rename `y' mean_`y'
}

* Long format: one row per outcome, per region
reshape long mean se lo hi, i(id) j(outcome) string
* The reshape leaves the separator in front of the outcome name; drop it
replace outcome = subinstr(outcome, "_", "", 1)

* Express every statistic in percent and show the bar labels with one decimal
foreach var of varlist mean se hi lo {
    replace `var' = `var' * 100
}
format mean %4.1f
* Create the bar positions. Each outcome gets a band of four slots (one per
* region) with one empty slot between bands. The offsets must agree with the
* ylabel() positions used in the graph, which sit at the centre of each band:
*   anemic     -> x = 1 to 4   (label at 2.5,  "Anemic")
*   bp_high    -> x = 6 to 9   (label at 7.5,  "High Blood Pressure")
*   overweight -> x = 11 to 14 (label at 12.5, "Overweight")
* Previously overweight was placed at 6-9 and bp_high at 11-14, so those two
* groups of bars were drawn under each other's label.
gsort outcome +mean
g x= region if outcome== "anemic"
replace x= region + 5 if outcome== "bp_high"
replace x= region + 10 if outcome== "overweight"
* GRAPH: Horizontal bars, grouped by region with CIs
* One bar layer per region so that each region gets its own colour and legend
* entry; rcap draws the CIs and the invisible scatter prints the mean to the
* right of each upper CI bound.
#delimit;
twoway (bar mean x if region==1, horizontal barw(0.50))
(bar mean x if region==2, horizontal barw(0.50))
(bar mean x if region==3, horizontal barw(0.50))
(bar mean x if region==4, horizontal barw(0.50))
(rcap hi lo x, horizontal lwidth(thin) lcolor(black))
(scatter x hi, msym(none) mlab(mean) mlabpos(3) mlabgap(1) mlabsize(2) mlabcolor(black)),
xlabel(0(10)70, format(%4.0f))
ylabel(2.5 "Anemic" 7.5 `" "High" "Blood Pressure" "' 12.5 "Overweight", nogrid)
ytitle("")
xtitle("Percent %")
title("Health indicator prevalence, by region")
legend(region(lcolor(white)) order(1 "North-East" 2 "Mid-West" 3 "South" 4 "West") rows(4) pos(3))
xscale(lcolor(none)) ;
graph export "$output/bargraph_horizontal.tif", replace ;
#delimit cr
R
# Horizontal bar
################################# Initial Setup ################################
# Install required packages if they are not already on your system.
# Installed package names are the row names of installed.packages(); matching
# against the whole matrix would also match text held in its other columns.
packages <- c("tidyverse")
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages, dependencies = TRUE,
                   repos = "http://cran.rstudio.com/")
}
# Loading required packages
library("tidyverse")
# Setting working directory. The data file and the export folder used later in
# this script are given relative to this folder, so point it at your own copy.
setwd("~/Dropbox (IDinsight)/Data visualization library")
############################## Loading dataset #################################
mydata <- read_csv("Data/toydata.csv", show_col_types = FALSE)
############################### Data processing ################################
# Reshape from wide to long: one row per person and disease indicator, with the
# indicator name in `disease` and its value in `disease_present`.
# Region is then converted to a factor; the order of the levels below dictates
# the order of the bars and of the legend entries.
mydata_long <- mydata %>%
  select(personid, region, bp_high, anemic, overweight) %>%
  pivot_longer(cols = c(bp_high, anemic, overweight),
               names_to = "disease",
               values_to = "disease_present") %>%
  arrange(personid) %>%
  mutate(region = factor(region,
                         levels = c("North-East", "Mid-West",
                                    "South", "West")))
# Creating the means and CIs
# Assuming 95% CI
alpha <- 0.05
# Creating means and CI and storing in my_sum tibble (dataframe), one row per
# region and disease:
#   n    - number of non-missing observations. Missing values are excluded from
#          the mean and sd, so they must not be counted here either; otherwise
#          the standard error is understated.
#   se   - standard error of the mean
#   ic   - half-width of the two-sided (1 - alpha) confidence interval, using
#          the t distribution with n - 1 degrees of freedom
# .groups = "drop" returns an ungrouped tibble so later steps are not affected
# by leftover grouping.
my_sum <- mydata_long %>%
  group_by(region, disease) %>%
  summarise(n = sum(!is.na(disease_present)),
            mean = mean(disease_present, na.rm = TRUE),
            sd = sd(disease_present, na.rm = TRUE),
            .groups = "drop") %>%
  mutate(se = sd / sqrt(n),
         ic = se * qt(1 - alpha / 2, df = n - 1)) %>%
  arrange(disease)
############################## Creating the graph ##############################
# The aesthetics are mapped to columns of my_sum (disease on the x-axis, mean as
# bar height, region as fill colour). Mapping columns of the piped data, rather
# than separate vectors, keeps every layer tied to the same rows.
plot <- my_sum %>%
  # Setting aesthetics which will be inherited by the other geometric objects
  ggplot(aes(x = disease, y = mean, fill = region)) +
  # geom_col() draws bars whose heights are the values in the data (the same as
  # geom_bar(stat = "identity")); geom_bar() by default counts the cases in
  # each group instead.
  # Position dodge applied so that the bars are not superimposed.
  geom_col(position = position_dodge(width = 0.7), width = 0.5) +
  # Position dodge applied so that the error bars are not superimposed. The
  # dodge width matches the bars so that each error bar sits on its own bar.
  # `linewidth` (ggplot2 >= 3.4.0) replaces the deprecated `size` for the
  # thickness of lines.
  geom_errorbar(aes(ymin = mean - ic, ymax = mean + ic),
                width = 0.15,
                linewidth = 0.5,
                position = position_dodge(width = 0.7)) +
  # Adding the text which will display the mean of the bars, in percent with
  # one decimal. It is positioned 0.035 units beyond (mean + ic) so that there
  # is some space between the error bars and the text.
  # As with the previous geometric objects, position dodge is applied so that
  # the labels are not superimposed.
  geom_text(aes(y = mean + ic + 0.035,
                label = formatC(mean * 100, digits = 1, format = "f")),
            position = position_dodge(width = 0.7),
            size = 2.8) +
  # Setting the legend/scale colors, one per region level
  scale_fill_manual(values = c("#264D96", "#5480D6", "#A8BFEB", "#DEE5F7")) +
  # Setting the x-axis labels. They are keyed by the values of `disease` so
  # that each label stays attached to the right group of bars whatever the
  # order of the categories. The expand option removes the space around the
  # axis and the data. Please feel free to change the values and see how the
  # plot is affected.
  # \n is the new line character, used to create a line break
  scale_x_discrete(labels = c(anemic = "Anemic",
                              bp_high = "High Blood\n Pressure",
                              overweight = "Overweight"),
                   expand = c(0, 0)) +
  # Customizing the y-axis by changing the limits, axis tick values and labels.
  # The means are proportions, so the tick labels show them as percentages.
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, max(my_sum$mean) + 0.2),
                     breaks = seq(from = 0, to = 0.7, by = 0.1),
                     labels = c("0", "10", "20", "30", "40",
                                "50", "60", "70")) +
  # Graph title and axis labels
  labs(title = "Health indicator prevalence, by region",
       y = "Percent %") +
  # Flipping the coordinates to create horizontal bars
  coord_flip()
############################# Formatting the graph #############################
# The themed chart is stored in final_plot so that exactly this object is both
# displayed and exported.
# The following visual changes have been made:
# Removed y-axis title (flipped coordinates)
# Made the y-axis line a light shade of gray
# Removed the x-axis line
# Created x-axis grid lines which are light gray and dotted
# Removed axis ticks
# Removed legend title
# Vertically aligned and resized x-axis text
# Center aligned and resized the plot title
final_plot <- plot +
  theme_classic() +
  theme(text = element_text(family = "Inter"),
        axis.title.y = element_blank(),
        axis.line.y = element_line(color = "gray80"),
        axis.line.x = element_blank(),
        panel.grid.major.x = element_line(colour = "gray90",
                                          linetype = "dotted"),
        axis.ticks = element_blank(),
        legend.title = element_blank(),
        axis.text.x = element_text(vjust = 0.5, size = 10),
        plot.title = element_text(hjust = 0.5, size = 12))
# Display the finished chart
final_plot
############################# Saving and exporting #############################
# indicating the export folder and the image file name
export_folder <- "R/Bar graphs/Exports/"
img_name <- "bar_horizontal_R_reviewed.png"
# The plot is passed explicitly so that the exported image does not depend on
# which plot happened to be created last in the session.
ggsave(paste0(export_folder, img_name), plot = final_plot)
Other details
Stata
Credit: Crystal Huang
You must have IDinsight styles installed to replicate the chart above.
R
Code written by Arkadeep Bandyopadhyay and reviewed by Sandra Alemayehu.
Colors for the graph have been selected from IDinsight’s brand guide.