gDat <- read.delim("gapminderDataFiveYear.txt")
str(gDat)
## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
## let's compute the difference between a variable's min and max
## use lifeExp or pop or gdpPercap
## functions to read help on min(), max(), range()

## student's work on this....

## get to know the functions mentioned above
min(gDat$lifeExp)
## [1] 23.6
max(gDat$lifeExp)
## [1] 82.6
range(gDat$lifeExp)
## [1] 23.6 82.6
## some solutions we might get
max(gDat$lifeExp) - min(gDat$lifeExp)
## [1] 59
with(gDat, max(lifeExp) - min(lifeExp))
## [1] 59
range(gDat$lifeExp)[2] - range(gDat$lifeExp)[1]
## [1] 59
with(gDat, range(lifeExp)[2] - range(lifeExp)[1])
## [1] 59
## let's package that little computation into a function
max_minus_min <- function(x) max(x) - min(x)
max_minus_min(gDat$lifeExp)
## [1] 59
## exercise:
## use your function on a sequence of numbers eg 1:5
max_minus_min(1:10)
## [1] 9
## use your function on some randomly generated numbers, eg runif(n)
max_minus_min(runif(1000))
## [1] 0.9975
## for functions you use alot or distribute, you might want to check the
## argument are valid!
## uncomment this during session
#max_minus_min("hello world")

## the max and min are extreme examples of a quantile
## median = 0.5 quantile
## 1st quartile = 0.25 quantile
## 3rd quartile = 0.75 quantile

## make the function more general:
## take the difference between two quantiles

## first, use the quantile function on lifeExp, pop, or gdpPercap to get the 0.5
## quantile or the 0.25 and 0.75 quantiles

## exercise in reading help and putting it to work
## what I expect students to get:
quantile(gDat$lifeExp)
##    0%   25%   50%   75%  100% 
## 23.60 48.20 60.71 70.85 82.60
quantile(gDat$lifeExp, probs = 0.5)
##   50% 
## 60.71
median(gDat$lifeExp)
## [1] 60.71
quantile(gDat$lifeExp, probs = c(0.25, 0.75))
##   25%   75% 
## 48.20 70.85
## get the 0.25 and 0.75 AND take the difference between them
## exercise!
the_quantiles <- quantile(gDat$lifeExp, probs = c(0.25, 0.75))
max(the_quantiles) - min(the_quantiles)
## [1] 22.65
IQR(gDat$lifeExp) # hey, we've reinvented IQR
## [1] 22.65
## now package THAT as a function with arguments x and probs
quantile_diff <- function(x, probs) {
  the_quantiles <- quantile(x, probs = probs)
  return(max(the_quantiles) - min(the_quantiles))
}

## uncomment this during session
#quantile_diff(gDat$lifeExp) ## oops! must specify probs argument
quantile_diff(gDat$lifeExp, probs = c(0.25, 0.75))
## [1] 22.65
## can you use our new function to take the difference between the max a min?
quantile_diff(gDat$lifeExp, probs = c(0, 1))
## [1] 59
max_minus_min(gDat$lifeExp)
## [1] 59
with(gDat, max(lifeExp) - min(lifeExp))
## [1] 59
## what if you want probs to default to c(0, 1)?
quantile_diff <- function(x, probs = c(0, 1)) {
  the_quantiles <- quantile(x, probs = probs)
  return(max(the_quantiles) - min(the_quantiles))
}

quantile_diff(gDat$lifeExp) ## now this works!
## [1] 59
quantile_diff(gDat$lifeExp, probs = c(0.25, 0.75)) # this still works too
## [1] 22.65
## discuss a "to do" list for this function
## we're not checking that x is numeric
## we're not checking that probs is numeric
## we're not checking that probs is length 2
## the return value has a weird name
## etc etc

## long term
## build these sort of checks into functions
## and write unit tests to verify that functions do the right thing on weird
## data, edge cases, etc.

## optional, depending on time and interest
## demo of the special `...` argument in R

## read the help on quantil(), esp. argument `names =`
quantile_diff <- function(x, probs = c(0, 1), ...) {
  the_quantiles <- quantile(x, probs = probs, ...)
  return(max(the_quantiles) - min(the_quantiles))
}
quantile_diff(gDat$lifeExp) ## still works!
## [1] 59
quantile_diff(gDat$lifeExp, probs = c(0.25, 0.75)) # this still works too
## [1] 22.65
quantile_diff(gDat$lifeExp, names = FALSE) ## allows me to shut off names in
## [1] 59
                                           ## quantile()

## direct prep for data aggregation

## let's write a function that takes a data.frame
## like the Gapminder data
## or just the data for one country or continent or ...
## fits a linear model of lifeExp on year
## and returns the estimated intercept and slope

fit <- lm( lifeExp ~ year, data = gDat)
summary(fit)
## 
## Call:
## lm(formula = lifeExp ~ year, data = gDat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39.95  -9.65   1.70  10.33  22.16 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -585.6522    32.3140   -18.1   <2e-16 ***
## year           0.3259     0.0163    20.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.6 on 1702 degrees of freedom
## Multiple R-squared:  0.19,   Adjusted R-squared:  0.189 
## F-statistic:  399 on 1 and 1702 DF,  p-value: <2e-16
## what's up with that crazy intercept?
## good time for a figure

library(ggplot2)
ggplot(gDat, aes(x = year, y = lifeExp)) + geom_point()

plot of chunk unnamed-chunk-1

ggplot(gDat, aes(x = year, y = lifeExp)) + geom_point() +
  geom_smooth(method = "lm")

plot of chunk unnamed-chunk-1

## intercept = estimate life exp in year 0
## let's re-fit so intercept = est life exp in 1952 = earliest year in dataset
fit <- lm( lifeExp ~ I(year - 1952), data=gDat)
summary(fit) # much better
## 
## Call:
## lm(formula = lifeExp ~ I(year - 1952), data = gDat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39.95  -9.65   1.70  10.33  22.16 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     50.5121     0.5300    95.3   <2e-16 ***
## I(year - 1952)   0.3259     0.0163    20.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.6 on 1702 degrees of freedom
## Multiple R-squared:  0.19,   Adjusted R-squared:  0.189 
## F-statistic:  399 on 1 and 1702 DF,  p-value: <2e-16
class(fit) ## read up on `lm` ... learn about coef()
## [1] "lm"
(fit.coef <- coef(fit))
##    (Intercept) I(year - 1952) 
##        50.5121         0.3259
names(fit.coef) <- c("intercept","slope")
fit.coef 
## intercept     slope 
##   50.5121    0.3259
## package that into a function
## input: a data.frame
## output: a numeric vector of length two
##        first element = estimated intercept
##        second element = estimate slope
##        names are "intercept" and "slope"
## GO!

jFun <- function(x) {
  fit <- lm( lifeExp ~ I(year - 1952), data = x)
  fit.coef <- coef(fit)
  names(fit.coef) <- c("intercept","slope")
  return(fit.coef)
}

jFun(gDat)
## intercept     slope 
##   50.5121    0.3259
## depending on time and interest, we could talk about better approaches to the
## "subtract 1952 from the year" problem

## what if we flexibility re: the shift?
## create a formal argument for the shift, but give it default value of 1952
jFun <- function(x, shift = 1952) {
  fit <- lm( lifeExp ~ I(year - shift), data = x)
  fit.coef <- coef(fit)
  names(fit.coef) <- c("intercept","slope")
  return(fit.coef)
}

jFun(gDat)
## intercept     slope 
##   50.5121    0.3259
jFun(gDat, shift = 0) ## can still get this if you really want
## intercept     slope 
## -585.6522    0.3259
jFun(gDat, shift = 2007) ## check against fitted line at 2007
## intercept     slope 
##   68.4368    0.3259
## what if we don't want to hard-wire 1952? another approach
jFun <- function(x, shift = NULL) {
  if(is.null(shift)){
    shift <- min(x$year)
  }
  fit <- lm( lifeExp ~ I(year - shift), data = x)
  fit.coef <- coef(fit)
  names(fit.coef) <- c("intercept","slope")
  return(fit.coef)
}

jFun(gDat)
## intercept     slope 
##   50.5121    0.3259
jFun(gDat, shift = 0)
## intercept     slope 
## -585.6522    0.3259
jFun(gDat, shift = 2007)
## intercept     slope 
##   68.4368    0.3259
## exercise:

## create a subset of the data
## eg just one continent or one country

## plot the lifeExp against year and superpose a line
## use jFun() to get intercept and slope
## sanity check numbers against plot

x <- subset(gDat, continent == "Asia")
jFun(x)
## intercept     slope 
##   47.6040    0.4531
ggplot(x, aes(x = year, y = lifeExp)) +
  geom_point() + geom_smooth(method = "lm")

plot of chunk unnamed-chunk-1

x <- subset(gDat, country == "Rwanda")
jFun(x)
## intercept     slope 
##  42.74195  -0.04583
ggplot(x, aes(x = year, y = lifeExp)) +
  geom_point() + geom_smooth(method = "lm")

plot of chunk unnamed-chunk-1