SDM2 : Varied Thrush - Model

Preparation

R library that provides bindings to GDAL library.

> install.packages("dismo")
> install.packages("caret")
> install.packages("InformationValue")
[3]:
library(raster)
library(dismo)
library(caret)
library(InformationValue)
[4]:
set.seed(30)

SDM

types - Correlative - Mechanistic

data - presence only - presence + absence

Species of Interest : Varied Thrush

The varied thrush breeds in western North America from Alaska to northern California.

It is migratory, with northern breeders moving south within or somewhat beyond the breeding range.

Other populations may only move altitudinally.

d4f16f4436dd4fe0a929b42e1b2f2a32

Read in data

[16]:
vath.data <- read.csv("./txt/vath_2004.csv")
vath.val <- read.csv("./txt/vath_VALIDATION.csv")
[3]:
head(vath.data)
A data.frame: 6 × 6
SURVEYIDTRANSECTPOINTVATHEASTINGNORTHING
<int><int><int><int><dbl><dbl>
114525116193059332.20173289.0
224525116195059142.22173151.8
334525116196058834.36173185.7
444525116199058754.24172876.0
554525116251059037.42181450.2
664525116254059336.71181389.1
[17]:
# split dataset by presence
vath.pres <- vath.data[vath.data$VATH==1,]
vath.abs <- vath.data[vath.data$VATH==0,]
vath.pres.xy <- as.matrix(vath.pres[,c("EASTING","NORTHING")])
vath.abs.xy <- as.matrix(vath.abs[,c("EASTING","NORTHING")])
[18]:
# validation dataset
vath.val.pres <- as.matrix(vath.val[vath.val$VATH==1, c("EASTING","NORTHING")])
vath.val.abs <- as.matrix(vath.val[vath.val$VATH==0, c("EASTING","NORTHING")])
vath.val.xy <- as.matrix(vath.val[,c("EASTING","NORTHING")])
[16]:
head(vath.val.pres)
A matrix: 6 × 2 of type dbl
EASTINGNORTHING
97257608.84260574.5
151 72212.39355337.3
152 72305.07355535.5
154156539.72354851.7
155156770.04354898.6
164 54244.40366721.0
[5]:
# env layers
elev <- raster("./R_db/elev.gri") # elevation
canopy <- raster("./R_db/cc2.gri") # canopy slope
mesic <- raster("./R_db/mesic.gri") # mesic forest
precip <- raster("./R_db/precip.gri") # precipitation
[23]:
#check maps
compareRaster(elev, canopy)
TRUE
[26]:
compareRaster(elev, mesic)
Error in compareRaster(elev, mesic): different extent
Traceback:

1. compareRaster(elev, mesic)
2. stop("different extent")
[6]:
elev
class      : RasterLayer
dimensions : 2083, 1643, 3422369  (nrow, ncol, ncell)
resolution : 200, 200  (x, y)
extent     : 19165, 347765, 164300, 580900  (xmin, xmax, ymin, ymax)
crs        : NA
source     : /home/user/SE_data/exercise/R_db/elev.grd
names      : elev_km
values     : 0, 3.079  (min, max)

[7]:
mesic
class      : RasterLayer
dimensions : 2050, 1586, 3251300  (nrow, ncol, ncell)
resolution : 210, 210  (x, y)
extent     : 16965, 350025, 153735, 584235  (xmin, xmax, ymin, ymax)
crs        : +proj=aea +lat_1=46 +lat_2=48 +lat_0=44 +lon_0=-109.5 +x_0=600000 +y_0=0 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs
source     : /home/user/SE_data/exercise/R_db/mesic.grd
names      : a_pmesic
values     : 0, 1  (min, max)

[8]:
precip
class      : RasterLayer
dimensions : 2152, 1664, 3580928  (nrow, ncol, ncell)
resolution : 200, 200  (x, y)
extent     : 16965, 349765, 153735, 584135  (xmin, xmax, ymin, ymax)
crs        : +proj=aea +lat_1=46 +lat_2=48 +lat_0=44 +lon_0=-109.5 +x_0=600000 +y_0=0 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs
source     : /home/user/SE_data/exercise/R_db/precip.grd
names      : ppt_cm
values     : 23, 287  (min, max)

Interpolation

23877087ef714cdea012aa1372cca298

[5]:
# resampling
mesic <- resample(x=mesic, y=elev, "ngb")
precip <- resample(x=precip, y=elev, "bilinear")
[6]:
#crop to same extent
mesic <- mask(mesic, elev)
precip <- mask(precip, elev)
[7]:
compareRaster(elev,precip, mesic)
TRUE
[10]:
# creat a forest layer at 1km resolution
fw.1km <- focalWeight(mesic, 1000, 'circle')
mesic1km <- focal(mesic, w=fw.1km, fun="mean", na.rm=T)
[42]:
#create raster stacck
layers <- stack(canopy, elev, mesic, mesic1km, precip)
names(layers) <- c("canopy", "elev", "mesic", "mesic1km", "precip")
[50]:
options(repr.plot.width=18, repr.plot.height=11)
plot(layers,cex.axis=2,cex.lab=2,cex.main=2.5,legend.args=list(text=NULL, side=1, cex.lab = 3, line=2.3))
../_images/CASESTUDY_SDM2_Vath_Rmodel_24_0.png
[13]:
options(repr.plot.width=18, repr.plot.height=16)
pairs(layers,maxpixels=1000,cex=0.5)
../_images/CASESTUDY_SDM2_Vath_Rmodel_25_0.png
[14]:
#drop correlated layer (mesic)

layers <- dropLayer(layers, 3)
[19]:
#Generate background points using
# 2000 was chosen for the illustration purpose.
# This number can be much larger in real practice.

back.xy <- randomPoints(layers, p=vath.pres.xy, n=2000)
colnames(back.xy) <- c("EASTING","NORTHING")
[74]:
head(back.xy)
A matrix: 6 × 2 of type dbl
xy
124065226200
152665317400
152465250400
139265258400
181465378800
59665423800
[20]:
options(repr.plot.width=9, repr.plot.height=9)
plot(elev)
points(back.xy)
../_images/CASESTUDY_SDM2_Vath_Rmodel_29_0.png
[21]:
#extract GIS data
pres.idpv <- extract(layers, vath.pres.xy)
back.idpv <- extract(layers, back.xy)
val.idpv <- extract(layers, vath.val.xy)
[22]:
#link data
df.pres <- data.frame(vath.pres.xy, pres.idpv, pres=1)
df.back <- data.frame(back.xy, back.idpv, pres=0)
df.val <- data.frame(vath.val, val.idpv)
[87]:
head(df.pres)
A data.frame: 6 × 7
EASTINGNORTHINGcanopyelevmesic1kmprecippres
<dbl><dbl><dbl><dbl><dbl><dbl><dbl>
22 95880.25191274.0 0.039087992.0330.1481481126.0001
38 90821.05210968.8-0.134644601.6820.3950617104.0001
39 90745.12210715.3 0.328752991.6650.4691358104.0001
42 90463.29209767.5 0.227024001.6760.7530864106.0001
43 90142.26209555.8 0.432407501.6590.8148148106.0001
48116258.21216962.9 0.594982681.4971.0000000 85.8751
[23]:
#remove any potential NAs
df.pres <- df.pres[complete.cases(df.pres),]
df.back <- df.back[complete.cases(df.back),]
df.val <- df.val[complete.cases(df.val),]
[24]:
# merge together
df.all <- rbind(df.pres, df.back)
[90]:
head(df.all)
A data.frame: 6 × 7
EASTINGNORTHINGcanopyelevmesic1kmprecippres
<dbl><dbl><dbl><dbl><dbl><dbl><dbl>
22 95880.25191274.0 0.039087992.0330.1481481126.0001
38 90821.05210968.8-0.134644601.6820.3950617104.0001
39 90745.12210715.3 0.328752991.6650.4691358104.0001
42 90463.29209767.5 0.227024001.6760.7530864106.0001
43 90142.26209555.8 0.432407501.6590.8148148106.0001
48116258.21216962.9 0.594982681.4971.0000000 85.8751
[25]:
# data transformation
# Scaling and centering the environmental variables to zero mean and variance of 1
predictors <- c("canopy","elev","mesic1km","precip")
df.all[,predictors] <- scale(df.all[,predictors])
[94]:
head(df.all)
A data.frame: 6 × 7
EASTINGNORTHINGcanopyelevmesic1kmprecippres
<dbl><dbl><dbl><dbl><dbl><dbl><dbl>
22 95880.25191274.0-0.13922201.5492040-0.609419480 0.85361111
38 90821.05210968.8-0.64508000.7652292 0.009843172 0.25394661
39 90745.12210715.3 0.70419680.7272587 0.195621968 0.25394661
42 90463.29209767.5 0.40799210.7518278 0.907774019 0.30846161
43 90142.26209555.8 1.00600810.7138576 1.062589682 0.30846161
48116258.21216962.9 1.47937900.3520229 1.527036672-0.24009511

Model Fitting

generalized linear model

  1. An exponential family of probability distributions.

  2. A linear predictor \(\displaystyle \eta =X\beta\)

  3. A link function \({\displaystyle g}\) such that \({\displaystyle E(Y\mid X)=\mu =g^{-1}(\eta )}\)

[26]:
mdl.vath <- glm(pres~canopy+elev+I(elev^2)+mesic1km+precip, family=binomial(link=logit), data=df.all)
[27]:
summary(mdl.vath)

Call:
glm(formula = pres ~ canopy + elev + I(elev^2) + mesic1km + precip,
    family = binomial(link = logit), data = df.all)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-0.7757  -0.3423  -0.2126  -0.1371   3.5321

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.85830    0.17404 -16.424  < 2e-16 ***
canopy       0.19543    0.09928   1.968  0.04901 *
elev        -0.72618    0.27404  -2.650  0.00805 **
I(elev^2)   -0.97856    0.24709  -3.960 7.49e-05 ***
mesic1km     0.39188    0.15197   2.579  0.00992 **
precip       0.49281    0.16762   2.940  0.00328 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 773.28  on 2093  degrees of freedom
Residual deviance: 674.77  on 2088  degrees of freedom
AIC: 686.77

Number of Fisher Scoring iterations: 8

Model Evaluation

[28]:
df.val[,predictors] <- scale(df.val[,predictors])
[29]:
df.val$pred <- predict(mdl.vath,df.val[,predictors],type="response")
[30]:
head(df.val)
A data.frame: 6 × 13
X.1XTRANSECTSTOPYEARVATHEASTINGNORTHINGcanopyelevmesic1kmprecippred
<int><int><int><int><int><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
111464811437220070209685.9325750.0 0.74425510-0.5121329-0.1800319-1.13506120.0381429149
222464811437320070209845.0325979.0 0.44215351-0.5299351-0.2678732-1.13506120.0346827307
333464811437620070210406.8326674.1 0.50654501-0.4705947-0.7070799-1.10453300.0305994483
444454311415320070230797.4201486.2-0.92568323 1.5855534-1.0877256-0.43198740.0006819783
555454311415420070231006.4201499.1-0.60895621 1.5291802-1.0877256-0.43198740.0008973454
666454311415620070231491.6201433.0 0.01492895 1.7784102-1.0877256-0.32097580.0003989799

confusion matrix

725c5faee3334c86a16a50fbc56d06f5

[31]:
cutoff <- optimalCutoff(df.val$VATH, df.val$pred)
[32]:
df.val[which(df.val$pred>=cutoff),]
A data.frame: 1 × 13
X.1XTRANSECTSTOPYEARVATHEASTINGNORTHINGcanopyelevmesic1kmprecippred
<int><int><int><int><int><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
155515571557471011616122008187299.98374993.71.695836-0.265871.283991.7512420.2617735
[33]:
confusionMatrix(df.val$VATH,df.val$pred,threshold=cutoff)
A data.frame: 2 × 2
01
<int><int>
01763148
1 0 1
[129]:
specificity(df.val$VATH,df.val$pred,threshold=cutoff)
1
[130]:
sensitivity(df.val$VATH,df.val$pred,threshold=cutoff)
0.00671140939597315
[140]:
# Calculate the percentage misclassification error for the given actuals and probaility scores.
misClassError(df.val$VATH,df.val$pred,threshold=cutoff)
0.0774

ROC curves

51d1a7e157ee47cab0385b05d436c123

[141]:
plotROC(df.val$VATH,df.val$pred)
../_images/CASESTUDY_SDM2_Vath_Rmodel_53_0.png

Discussions

  1. HW vs. Project

  2. Future course topics

    • thematic subjects : eg. GLM, Bayesian, etc.

    • data techniques : eg. transformations, sampling, etc.

    • case studies

    • others ?

Feedback : Please Provide input via slack channel by 4pm Apr 8. (EDT)

References

[ ]: