Getting started
Clear the environment
Load the following packages
library(knitr) library(dplyr) library(stringr) library(reshape2) library(magrittr)
May 1 2017
Getting started
Clear the environment
Load the following packages
library(knitr) library(dplyr) library(stringr) library(reshape2) library(magrittr)
We will use 2 built-in datasets for data visualisation
A set of samples with measurements of several traits
iris
contains petal and sepal measurements for irises
A time series
ChickWeight
contains weights for chicks on different diets
Load and configure the dataset iris
# Load the built-in iris dataset into the workspace.
data(iris)
# Convert it to a tibble (a newer data frame format). as_tibble() replaces
# the deprecated as_data_frame().
iris <- as_tibble(iris)
# Print the converted data.
iris
R
has plotting functions in the base package graphics
# Open the help pages for three plotting functions, one lookup per line.
# The topic form `?name` is used; the call form `?f()` is intended for
# looking up documentation of the method a call would dispatch to.
?plot
?boxplot
?hist
And other plotting packages have been written
We will focus on the function ggplot()
ggplot2
ggplot2
is flexible and powerful
Has a unique approach and syntax
Consists of layers of plotting information
The main function in ggplot2
is ggplot()
# Attach ggplot2, then call its main function with no arguments.
library("ggplot2")
ggplot()
4 basic steps to plotting with ggplot()
:
aes()
= aesthetics
The first value is the dataset to use
# Pass the dataset through the named `data` argument.
ggplot(data = iris)
is the same as
# Pass the dataset positionally, as the first argument.
ggplot(iris)
We have a blank canvas ie the plot area
aes()
defines what is plotted, colour and shape etc
# Map Species to the x axis and Petal.Length to the y axis.
# No geometry layer is added here.
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length)
)
Now we have a grid and axes
Note that we have chosen to plot
- a numeric variable (Petal.Length
) vs
- a categorical variable (Species
)
This will constrain what geometries we can use
Add a geometry using various geom_...()
functions
# Add a point geometry to the plot. The `+` ends the first line and the
# inline comment follows it, so the geom_point() layer stays part of the
# expression instead of being absorbed into the comment.
ggplot(iris, aes(x = Species, y = Petal.Length)) + # note the +
  geom_point()
Colour the points by species
# Same point plot, with the point colour mapped to Species.
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length, colour = Species)
) +
  geom_point()
We can modify the points in many ways eg
size()
shape()
alpha()
- transparency
Add a main title
Clean up the y axis title
Remove the figure legend
# Point plot of petal length per species with a main title, a tidied
# y-axis label, and the legend removed.
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length, colour = Species)
) +
  geom_point() +
  labs(
    title = "Iris Petal Length by Species",
    y = "Petal Length"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )
There are many options under theme()
The points are tightly clustered on the species line
We can stagger them using geom_jitter()
# Replace geom_point() with geom_jitter() to stagger the points.
# Title, y-axis label and theme settings are the same as before.
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length, colour = Species)
) +
  geom_jitter() +
  labs(
    title = "Iris Petal Length by Species",
    y = "Petal Length"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )
Use geom_boxplot()
to display the data
# Show the same data as one boxplot per species.
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length, colour = Species)
) +
  geom_boxplot() +
  labs(
    title = "Iris Petal Length by Species",
    y = "Petal Length"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )
We can plot more than one geometry onto the same plot
- geom_boxplot()
- geom_jitter()
# Two geometries on one plot: dark grey boxplots with alpha = 0,
# followed by jittered points with alpha = 0.5.
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length, colour = Species)
) +
  geom_boxplot(colour = "darkgrey", alpha = 0) +
  geom_jitter(alpha = 0.5) +
  labs(
    title = "Iris Petal Length by Species",
    y = "Petal Length"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )
Embed aes()
within each geom()
# No aesthetics are set in ggplot() itself; each boxplot layer carries its
# own aes(), so the four measurements share one plot, each in its own colour.
ggplot(data = iris) +
  geom_boxplot(
    mapping = aes(x = Species, y = Petal.Length),
    colour = "darkblue",
    alpha = 0
  ) +
  geom_boxplot(
    mapping = aes(x = Species, y = Petal.Width),
    colour = "darkgreen",
    alpha = 0
  ) +
  geom_boxplot(
    mapping = aes(x = Species, y = Sepal.Width),
    colour = "purple",
    alpha = 0
  ) +
  geom_boxplot(
    mapping = aes(x = Species, y = Sepal.Length),
    colour = "darkred",
    alpha = 0
  ) +
  labs(
    title = "Measurements by Iris Species",
    y = "Size"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
We will now plot
- a numeric variable (Petal.Length
) vs
- another numeric variable (Sepal.Length
)
Again this will constrain what geometries we can use
Start with geom_point()
# Plain scatter plot of petal length against sepal length.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Petal.Length)
) +
  geom_point()

# The same scatter coloured by species, with a bold title and
# cleaned-up axis labels.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Petal.Length, colour = Species)
) +
  geom_point() +
  labs(
    title = "Iris Petal Length vs Sepal Length",
    x = "Sepal Length",
    y = "Petal Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Use geom_smooth()
Add line for each species
# Scatter plot plus a geom_smooth(method = "lm") layer. Colour is mapped to
# Species in the shared aes(), so the smooth layer inherits it as well.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Petal.Length, colour = Species)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Iris Petal Length vs Sepal Length",
    x = "Sepal Length",
    y = "Petal Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Correlation between sepal and petal length
# Attach corrplot; the corrplot() calls later in the file need it.
library(corrplot)
# Correlation between sepal length and petal length, computed on
# pairwise-complete observations.
cor(
  iris$Sepal.Length,
  iris$Petal.Length,
  use = "pairwise.complete.obs"
)
Correlation between all 4 numerical variables
# Print the data for reference.
iris
# Correlation matrix of every column except Species, computed on
# pairwise-complete observations.
correlations <- cor(
  iris %>% select(-Species),
  use = "pairwise.complete.obs"
)
# Print the matrix.
correlations
# Draw the correlation matrix with corrplot's "shade" method.
corrplot(
  correlations,
  method = "shade",
  shade.col = NA,
  tl.col = "black",
  tl.srt = 90,
  order = "AOE",
  mar = c(2, 0, 2, 0)
)
With values
# Same correlation plot, with the coefficient values added in dark grey
# through addCoef.col.
corrplot(
  correlations,
  method = "shade",
  shade.col = NA,
  tl.col = "black",
  tl.srt = 90,
  order = "AOE",
  mar = c(2, 0, 2, 0),
  addCoef.col = "darkgrey"
)
Load and configure the dataset ChickWeight
And fix column names!
# Load the built-in ChickWeight dataset into the workspace.
data(ChickWeight)
# Convert it to a tibble (a newer data frame format). as_tibble() replaces
# the deprecated as_data_frame().
ChickWeight <- as_tibble(ChickWeight)
ChickWeight
# Overwrite the column names positionally with capitalised, clearer ones.
colnames(ChickWeight) <- c("Weight", "Day", "Chick", "Diet")
ChickWeight
# Count how many distinct chicks were fed each diet. Each step sits on its
# own line so the trailing comments no longer swallow the rest of the pipe.
DietCount <- ChickWeight %>% # take starting df
  select(Diet, Chick) %>%    # keep 2 of the columns
  unique() %>%               # keep 1 copy of each combination
  count(Diet) %>%            # count number of chicks by diet
  rename(Count = n)          # rename the count column
DietCount
Use geom_col()
# Column chart of the number of chicks per diet, filled by diet, with the
# legend removed.
ggplot(
  data = DietCount,
  mapping = aes(x = Diet, y = Count, fill = Diet)
) +
  geom_col() +
  labs(title = "Number of chicks on each diet") +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
And again with diets as integers - this is cheating!
# Same column chart, but fill is mapped to as.integer(Diet) rather than to
# Diet itself.
ggplot(
  data = DietCount,
  mapping = aes(x = Diet, y = Count, fill = as.integer(Diet))
) +
  geom_col() +
  labs(title = "Number of chicks on each diet") +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
Use geom_histogram()
to get the distributions
Use facet_wrap()
to plot each day in its own plot
# Histogram of chick weights, one facet per Day. scales = "free" lets each
# facet use its own axis ranges.
ggplot(
  data = ChickWeight,
  mapping = aes(x = Weight)
) +
  geom_histogram(fill = "darkblue") +
  facet_wrap(~ Day, scales = "free") +
  labs(
    title = "Distribution of Chick Weight by Day",
    y = "Count"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
Back to geom_point()
to create a scatter plot
# Scatter plot of weight against day, one colour per chick, legend removed.
ggplot(
  data = ChickWeight,
  mapping = aes(x = Day, y = Weight, colour = Chick)
) +
  geom_point() +
  labs(title = "Chick Weights by Day") +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
Use geom_line()
instead of geom_point()
# Line plot of weight against day, one coloured line per chick.
# The x-axis labels are angled at 45 degrees and the legend is removed.
ggplot(
  data = ChickWeight,
  mapping = aes(x = Day, y = Weight, colour = Chick)
) +
  geom_line() +
  labs(title = "Chick Weights by Day") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
Group by chick so that the data from day to day remains connected, but colour by diet
# `group = Chick` keeps one line per chick, while `colour = Diet` colours
# each line by the diet that chick was on.
ggplot(
  data = ChickWeight,
  mapping = aes(x = Day, y = Weight, group = Chick, colour = Diet)
) +
  geom_line() +
  labs(title = "Chick Weights by Day") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Want mean, SD and 95% CI to compare the diets
# Per-diet, per-day summary of chick weights: sample size, mean, standard
# deviation and the half-width of a t-based 95% confidence interval, plus
# "mean ± x" display strings.
WeightSummary <- ChickWeight %>%
  group_by(Diet, Day) %>%
  summarise(
    N = length(Weight) %>% as.integer(),
    Mean = mean(Weight),
    SD = sd(Weight)
  ) %>%
  # Drop the leftover grouping so later steps work on a plain tibble.
  ungroup() %>%
  mutate(
    MeanSD = paste(round(Mean, digits = 1), round(SD, digits = 1), sep = " ± "),
    # Compute the interval from the unrounded SD and round only the result,
    # so rounding error is not carried into the CI.
    CI95 = (qt(0.975, df = N - 1) * SD / sqrt(N)) %>% round(digits = 1),
    # Round the stored Mean and SD only after CI95 has been derived.
    Mean = round(Mean, digits = 1),
    SD = round(SD, digits = 1),
    MeanCI95 = paste(Mean, CI95, sep = " ± ")
  )
WeightSummary
# One line per diet showing the mean weight on each day.
ggplot(
  data = WeightSummary,
  mapping = aes(x = Day, y = Mean, group = Diet, colour = Diet)
) +
  geom_line() +
  labs(title = "Chick Weights over Day") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
The diets are not in order of increasing effectiveness
Reorder them
# Show the Diet column before reordering.
WeightSummary$Diet
# Make Diet an ordered factor with the level order 1, 2, 4, 3.
WeightSummary$Diet <- ordered(
  WeightSummary$Diet,
  levels = c("1", "2", "4", "3")
)
# Show it again to confirm the new level order.
WeightSummary$Diet
# Redraw the mean-weight lines using the reordered factor.
ggplot(WeightSummary, aes(x = Day, y = Mean, group = Diet, colour = Diet)) +
  geom_line() +
  labs(title = "Chick Weights over Day") +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
Use geom_ribbon()
# Ribbon spanning Mean - CI95 to Mean + CI95 for each diet. With alpha = 0
# only the outline, coloured by Diet, is visible.
ggplot(
  data = WeightSummary,
  mapping = aes(x = Day, y = Mean, colour = Diet)
) +
  geom_ribbon(
    mapping = aes(ymin = Mean - CI95, ymax = Mean + CI95, colour = Diet),
    alpha = 0
  ) +
  labs(title = "Chick Weights over Time") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
Go back to the data
Is your visual interpretation consistent?
Convert the data from long to wide format
# Reshape from long to wide: one row per Diet, one column per Day, with
# the "mean ± CI" string as the cell value.
WeightSummary2 <- WeightSummary %>%
  dcast(Diet ~ Day, value.var = "MeanCI95")
# Check the dimensions of the wide table.
WeightSummary2 %>% dim()
# Show the Diet column next to column 13.
# NOTE(review): column 13 is presumably the final day; confirm against the
# dim() output above.
WeightSummary2[, c(1, 13)]