gDat <- read.delim("gapminderDataFiveYear.txt")
str(gDat)
## 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
## let's compute the difference between a variable's min and max
## use lifeExp or pop or gdpPercap
## functions to read help on min(), max(), range()
## student's work on this....
## get to know the functions mentioned above
min(gDat$lifeExp)
## [1] 23.6
max(gDat$lifeExp)
## [1] 82.6
range(gDat$lifeExp)
## [1] 23.6 82.6
## some solutions we might get
max(gDat$lifeExp) - min(gDat$lifeExp)
## [1] 59
with(gDat, max(lifeExp) - min(lifeExp))
## [1] 59
range(gDat$lifeExp)[2] - range(gDat$lifeExp)[1]
## [1] 59
with(gDat, range(lifeExp)[2] - range(lifeExp)[1])
## [1] 59
## let's package that little computation into a function
max_minus_min <- function(x) max(x) - min(x)
max_minus_min(gDat$lifeExp)
## [1] 59
## exercise:
## use your function on a sequence of numbers eg 1:5
max_minus_min(1:10)
## [1] 9
## use your function on some randomly generated numbers, eg runif(n)
max_minus_min(runif(1000))
## [1] 0.9975
## for functions you use alot or distribute, you might want to check the
## argument are valid!
## uncomment this during session
#max_minus_min("hello world")
## the max and min are extreme examples of a quantile
## median = 0.5 quantile
## 1st quartile = 0.25 quantile
## 3rd quartile = 0.75 quantile
## make the function more general:
## take the difference between two quantiles
## first, use the quantile function on lifeExp, pop, or gdpPercap to get the 0.5
## quantile or the 0.25 and 0.75 quantiles
## exercise in reading help and putting it to work
## what I expect students to get:
quantile(gDat$lifeExp)
## 0% 25% 50% 75% 100%
## 23.60 48.20 60.71 70.85 82.60
quantile(gDat$lifeExp, probs = 0.5)
## 50%
## 60.71
median(gDat$lifeExp)
## [1] 60.71
quantile(gDat$lifeExp, probs = c(0.25, 0.75))
## 25% 75%
## 48.20 70.85
## get the 0.25 and 0.75 AND take the difference between them
## exercise!
the_quantiles <- quantile(gDat$lifeExp, probs = c(0.25, 0.75))
max(the_quantiles) - min(the_quantiles)
## [1] 22.65
IQR(gDat$lifeExp) # hey, we've reinvented IQR
## [1] 22.65
## now package THAT as a function with arguments x and probs
quantile_diff <- function(x, probs) {
the_quantiles <- quantile(x, probs = probs)
return(max(the_quantiles) - min(the_quantiles))
}
## uncomment this during session
#quantile_diff(gDat$lifeExp) ## oops! must specify probs argument
quantile_diff(gDat$lifeExp, probs = c(0.25, 0.75))
## [1] 22.65
## can you use our new function to take the difference between the max a min?
quantile_diff(gDat$lifeExp, probs = c(0, 1))
## [1] 59
max_minus_min(gDat$lifeExp)
## [1] 59
with(gDat, max(lifeExp) - min(lifeExp))
## [1] 59
## what if you want probs to default to c(0, 1)?
quantile_diff <- function(x, probs = c(0, 1)) {
the_quantiles <- quantile(x, probs = probs)
return(max(the_quantiles) - min(the_quantiles))
}
quantile_diff(gDat$lifeExp) ## now this works!
## [1] 59
quantile_diff(gDat$lifeExp, probs = c(0.25, 0.75)) # this still works too
## [1] 22.65
## discuss a "to do" list for this function
## we're not checking that x is numeric
## we're not checking that probs is numeric
## we're not checking that probs is length 2
## the return value has a weird name
## etc etc
## long term
## build these sort of checks into functions
## and write unit tests to verify that functions do the right thing on weird
## data, edge cases, etc.
## optional, depending on time and interest
## demo of the special `...` argument in R
## read the help on quantil(), esp. argument `names =`
quantile_diff <- function(x, probs = c(0, 1), ...) {
the_quantiles <- quantile(x, probs = probs, ...)
return(max(the_quantiles) - min(the_quantiles))
}
quantile_diff(gDat$lifeExp) ## still works!
## [1] 59
quantile_diff(gDat$lifeExp, probs = c(0.25, 0.75)) # this still works too
## [1] 22.65
quantile_diff(gDat$lifeExp, names = FALSE) ## allows me to shut off names in
## [1] 59
## quantile()
## direct prep for data aggregation
## let's write a function that takes a data.frame
## like the Gapminder data
## or just the data for one country or continent or ...
## fits a linear model of lifeExp on year
## and returns the estimated intercept and slope
fit <- lm( lifeExp ~ year, data = gDat)
summary(fit)
##
## Call:
## lm(formula = lifeExp ~ year, data = gDat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.95 -9.65 1.70 10.33 22.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -585.6522 32.3140 -18.1 <2e-16 ***
## year 0.3259 0.0163 20.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.6 on 1702 degrees of freedom
## Multiple R-squared: 0.19, Adjusted R-squared: 0.189
## F-statistic: 399 on 1 and 1702 DF, p-value: <2e-16
## what's up with that crazy intercept?
## good time for a figure
library(ggplot2)
ggplot(gDat, aes(x = year, y = lifeExp)) + geom_point()
ggplot(gDat, aes(x = year, y = lifeExp)) + geom_point() +
geom_smooth(method = "lm")
## intercept = estimate life exp in year 0
## let's re-fit so intercept = est life exp in 1952 = earliest year in dataset
fit <- lm( lifeExp ~ I(year - 1952), data=gDat)
summary(fit) # much better
##
## Call:
## lm(formula = lifeExp ~ I(year - 1952), data = gDat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.95 -9.65 1.70 10.33 22.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.5121 0.5300 95.3 <2e-16 ***
## I(year - 1952) 0.3259 0.0163 20.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.6 on 1702 degrees of freedom
## Multiple R-squared: 0.19, Adjusted R-squared: 0.189
## F-statistic: 399 on 1 and 1702 DF, p-value: <2e-16
class(fit) ## read up on `lm` ... learn about coef()
## [1] "lm"
(fit.coef <- coef(fit))
## (Intercept) I(year - 1952)
## 50.5121 0.3259
names(fit.coef) <- c("intercept","slope")
fit.coef
## intercept slope
## 50.5121 0.3259
## package that into a function
## input: a data.frame
## output: a numeric vector of length two
## first element = estimated intercept
## second element = estimate slope
## names are "intercept" and "slope"
## GO!
jFun <- function(x) {
fit <- lm( lifeExp ~ I(year - 1952), data = x)
fit.coef <- coef(fit)
names(fit.coef) <- c("intercept","slope")
return(fit.coef)
}
jFun(gDat)
## intercept slope
## 50.5121 0.3259
## depending on time and interest, we could talk about better approaches to the
## "subtract 1952 from the year" problem
## what if we flexibility re: the shift?
## create a formal argument for the shift, but give it default value of 1952
jFun <- function(x, shift = 1952) {
fit <- lm( lifeExp ~ I(year - shift), data = x)
fit.coef <- coef(fit)
names(fit.coef) <- c("intercept","slope")
return(fit.coef)
}
jFun(gDat)
## intercept slope
## 50.5121 0.3259
jFun(gDat, shift = 0) ## can still get this if you really want
## intercept slope
## -585.6522 0.3259
jFun(gDat, shift = 2007) ## check against fitted line at 2007
## intercept slope
## 68.4368 0.3259
## what if we don't want to hard-wire 1952? another approach
jFun <- function(x, shift = NULL) {
if(is.null(shift)){
shift <- min(x$year)
}
fit <- lm( lifeExp ~ I(year - shift), data = x)
fit.coef <- coef(fit)
names(fit.coef) <- c("intercept","slope")
return(fit.coef)
}
jFun(gDat)
## intercept slope
## 50.5121 0.3259
jFun(gDat, shift = 0)
## intercept slope
## -585.6522 0.3259
jFun(gDat, shift = 2007)
## intercept slope
## 68.4368 0.3259
## exercise:
## create a subset of the data
## eg just one continent or one country
## plot the lifeExp against year and superpose a line
## use jFun() to get intercept and slope
## sanity check numbers against plot
x <- subset(gDat, continent == "Asia")
jFun(x)
## intercept slope
## 47.6040 0.4531
ggplot(x, aes(x = year, y = lifeExp)) +
geom_point() + geom_smooth(method = "lm")
x <- subset(gDat, country == "Rwanda")
jFun(x)
## intercept slope
## 42.74195 -0.04583
ggplot(x, aes(x = year, y = lifeExp)) +
geom_point() + geom_smooth(method = "lm")