DECIPHER - R Lesson #13

R Lesson #13 - Fitting parameters

This lesson showcases the optim function, which can be used to find the input parameter values that minimize the output of another function. The optim function works by adaptively changing the input parameters until it has optimized the objective function.

Every optimization has at least three parts: (1) a set of one or more input parameters stored inside a vector, (2) a score or metric for goodness-of-fit, and (3) a function that returns goodness-of-fit.

For the first example, the goal is to fit a line to a set of points. The equation for a line is y = m*x + b, where m and b are the parameters being optimized. Their optimal values should equal the coefficients found by lm(). Note that the sum-of-squared error decreases with each additional iteration.

Hide output

X <- 1:6
> Y <- c(12, 1, 44, 17, 95, 11) # randomly selected values
> plot(X, Y)
> 
> fitMe <- function(params) {
+    b <- params[1]
+    m <- params[2]
+    
+    points(X, m*X + b, type="l")
+    Sys.sleep(0.1) # slow it down to watch progress
+    
+    error <- sum((Y - (m*X + b))^2)
+    print(error)
+    return(error)
+ }
> 
> o <- optim(c(b=0, m=10), fitMe)
[1] 5516
[1] 5582
[1] 5917
[1] 5321
[1] 5237.5
[1] 5249.5
[1] 5256.219
[1] 5540
[1] 5307.5
[1] 5319.5
[1] 5256.875
[1] 5262.875
[1] 5244.969
[1] 5247.969
[1] 5242.451
[1] 5234.217
[1] 5230.281
[1] 5229.285
[1] 5230.532
[1] 5224.18
[1] 5223.279
[1] 5226.742
[1] 5224.447
[1] 5229.278
[1] 5224.903
[1] 5224.906
[1] 5223.81
[1] 5225.205
[1] 5223.613
[1] 5223.197
[1] 5223.466
[1] 5223.421
[1] 5223.201
[1] 5223.334
[1] 5223.171
[1] 5223.287
[1] 5223.151
[1] 5223.355
[1] 5223.147
[1] 5223.164
[1] 5223.147
[1] 5223.181
[1] 5223.143
> o$par # optimal parameters
       b        m 
5.002281 7.142624 
> 
> coef(lm(Y~X)) # same as o$par
(Intercept)           X 
   5.000000    7.142857

In the next example the goal is to find the x-value where a function is minimized. Here only one parameter, x, is being optimized.

it <- 0 # global variable
> cols <- rainbow(100)
> 
> f <- function(x, plot=TRUE) {
+    it <<- it + 1L # globally iterate
+    
+    # calculate the sum of three functions
+    y1 <- -50*(x - 2)^2 + 1
+    y2 <- abs(x) - 1
+    y3 <- 2*x^4 - 20*x^3
+    Y <- y1 + y2 + y3
+    
+    if (plot && it <= 100)
+       points(x, Y, col=cols[it])
+    
+    return(Y)
+ }
> 
> t <- seq(-10, 15, 0.01)
> plot(t, f(t, plot=FALSE), type="l")
> 
> o <- optim(0, # initial parameter value
+    f, # function to find minimum value
+    method="SANN") # optimization method
> 
> print(it) # number of iterations
[1] 10001
> points(o$par, f(o$par))

For the next part of this lesson, it is necessary to download the weather dataset available here. This dataset is in RData format, and was originally created using the save function in R. After loading the 'Weather' object, the data.frame contains weather information for Madison, Wisconsin going back to the year 1948.

load("<<PATH TO Madison_Weather.RData>>")
> 
> time <- Weather$Year - 1900 +
+    Weather$Day/365 +
+    Weather$Hour/(365*24) +
+    Weather$Min/(365*24*60)
> Weather <- cbind(Weather, Time=time)
> 
> max_points <- 1e5
> plot(Weather$Time[seq_len(max_points)],
+    Weather$Temp[seq_len(max_points)],
+    type="l",
+    ylab="Temperature (degrees Fahrenheit)",
+    xlab="Time")

Here, the objective is to fit a sine wave to Temp over time. A sine wave has four parameters: (1) amplitude, (2) phase, (3) frequency, and (4) offset. The goodness-of-fit function will be sum of squared error. Note that this requires optimizing multiple parameters ('params') simultaneously.

# sum of squared errors (SSE)
> SSE <- function(estimated, actual) {
+    sum((estimated - actual)^2,
+       na.rm=TRUE)
+ }
> 
> iteration <- 0L # used as a global variable
> cols <- rep(rainbow(100), 100) # colors to plot
> plot_sine <- function(params, ...) {
+    amp <- params[1]
+    phase <- params[2]
+    freq <- params[3]
+    off <- params[4]
+    
+    x <- Weather$Time[seq_len(max_points)]
+    y <- amp*sin(freq*(x - phase)) + off
+    points(x,
+       y,
+       type="l",
+       ...)
+ }
> 
> # function being passed to optim:
> goodFit <- function(params, plot=FALSE) {
+    iteration <<- iteration + 1L
+    
+    amp <- params[1]
+    phase <- params[2]
+    freq <- params[3]
+    off <- params[4]
+    
+    sine <- amp*sin(freq*(Weather$Time - phase)) + off
+    
+    error <- SSE(sine,
+       Weather$Temp)
+    
+    if (plot)
+       plot_sine(params,
+          lwd=0.5,
+          col=cols[iteration])
+    
+    return(error)
+ }
> 
> o <- optim(c(1, 1, 1, 1), # params
+    goodFit, # function
+    plot=TRUE) # function arguments
> 
> print(iteration)
[1] 79
> o # the output is a list
$par
[1] -25.48035 -27.69295  16.78735  38.93353


$value
[1] 510106022


$counts
function gradient 
      79       NA 


$convergence
[1] 10


$message
NULL


> 
> plot_sine(o$par,
+    col="darkred", lwd=3)

Note that optim did not converge because it is sensitive to the starting parameters, especially when the method is "Nelder-Mead" (the default). However, in this case the other methods also fail to find the optimal combination of parameters because the initial values are too far off. The best option is always to provide reasonable starting parameters and then let optim fine tune, as shown below.

# re-initialize the plot
> plot(Weather$Time[seq_len(max_points)],
+    Weather$Temp[seq_len(max_points)],
+    type="l",
+    ylab="Temperature (degrees Fahrenheit)",
+    xlab="Time")
> 
> iteration <- 0L # used as a global variable
> o <- optim(c(amp=20, # +/- 20 from mean temp
+    phase=0.3, # guess and check
+    freq=2*pi, # one cycle per year
+    off=50), # mean temp in Madison
+    goodFit,
+    plot=TRUE)
> 
> print(iteration)
[1] 419
> o
$par
       amp      phase       freq        off 
26.2572068  0.3034806  6.2834491 46.9456483 


$value
[1] 74467278


$counts
function gradient 
     419       NA 


$convergence
[1] 0


$message
NULL


> 
> plot_sine(o$par,
+    col="darkred", lwd=3)

For the rest of this lesson, it is necessary to download the zipped folder of images from here and unzip it. This directory contains a set of images taken every four hours of a bacterial colony growing. In order to read in the images one-by-one, it is first necessary to define the path to the unzipped directory. The average brightness of each image will be used as a proxy for the degree of bacterial growth.

# note that the final '/' is required:
> path <- "<<PATH TO ../single_colony/>>"
> images <- list.files(path) # list files
> head(images)
[1] "Test5S01_001_2.jpg" "Test5S01_002_2.jpg"
[3] "Test5S01_003_2.jpg" "Test5S01_004_2.jpg"
[5] "Test5S01_005_2.jpg" "Test5S01_006_2.jpg"
> 
> #install.packages("jpeg") # only need to run this once
> library(jpeg)
> intensity <- numeric(length(images))
> # loop though the images in the directory:
> for (i in seq_along(images)) {
+    img <- readJPEG(paste0(path,
+       images[i]))
+    intensity[i] <- mean(img)
+ }
> 
> x <- (0:(length(intensity) - 1))*4
> plot(x,
+    intensity,
+    ylab="Average Intensity",
+    xlab="Hours of Growth")

Now the goal is to fit a sigmoidal function to the points. A sigmoid is a s-shaped curve that can be obtained with a function of four parameters: (1) amplitude, (2) vertical offset, (3) slope, and (4) horizontal shift. As always, it is best to provide a reasonable set of starting values for optim, as determined below:

# parameters determined through guess-and-check
> offset <- min(intensity)
> amp <- diff(range(intensity))
> slope <- 5
> shift <- 4*which.min(abs(intensity - mean(intensity)))
> 
> # the sigmoidal function of four parameters
> y <- amp/(1 + exp(-slope*(log(x) - log(shift)))) + offset
> 
> points(x, y, type="l") # plot the initial curve
> 
> # optimize the parameters
> f <- function(params, plot=FALSE) {
+    offset <- params[1]
+    amp <- params[2]
+    slope <- params[3]
+    shift <- params[4]
+    
+    y <- amp/(1 + exp(-slope*(log(x) - log(shift)))) + offset
+    
+    error <- sum((intensity - y)^2) # SSE
+    
+    if (plot)
+       points(x, y, type="l", col="red")
+    
+    invisible(error)
+ }
> 
> # simultaneously optimize the parameters
> o <- optim(c(offset,
+       amp,
+       slope,
+       shift),
+    f)
> # display the parameters
> cat("offset =", o$par[1],
+    "\namp =", o$par[2],
+    "\nslope =", o$par[3],
+    "\nshift =", o$par[4])
offset = 0.1183825 
amp = 0.2141441 
slope = 3.971287 
shift = 120.7618
> 
> # plot the optimized sigmoidal
> f(o$par, plot=TRUE)
> legend("topleft",
+    c("Initial", "Optimal"),
+    lty=1,
+    col=c("black", "red"))

< Previous Lesson Next Lesson >