Appendix

GEOG246-346

1 Module 1 practice answers

N/A

2 Module 2 practice answers

N/A

3 Module 3 practice answers

3.1 Practice 1

3.1.1 Questions

Class of a is integer.
Because the vector is contained in a list, and using a single [] to pull out an element returns a 1-element list, so you have to use list notation to get the vector.
a is a vector of integers; l is a list containing 2 integer vectors and 1 character vector.
We applied a function that is the fourth element of list l2 to the integer vector f that is the second element of l2.

3.1.2 Code

# Build a named integer vector, a character vector, and a list holding both,
# then demonstrate the main ways of subsetting each of them.
a <- 20:30
b <- letters

# Name each element of a with the leading letters of b. seq_along() is used
# instead of 1:length(a), which would yield c(1, 0) for a zero-length vector.
names(a) <- b[seq_along(a)]
a

l <- list(a = a, b = b)

# Vector subsetting: by logical condition, by position, and the last two values
a[a >= 26]
a[c(1, 7)]
a[c(length(a) - 1, length(a))]

# Character vector subsetting by set membership
b[b %in% c("a", "c", "g")]

# List subsetting: [[ ]] returns the element itself, [ ] a 1-element list
l[[1]]
l[1]
l$a[l$a < 25]
l[["a"]][l[["a"]] == 25]
l[[2]][l[[2]] %in% c("d", "e", "f")]

3.2 Practice 2

3.2.1 Questions

The vector is coerced to character.
The whole matrix is coerced to character. If data.frame, only m$b is coerced to character.
a) Have to use both row and column indices to extract a row, column subset; b) Can use list notation for a data.frame (e.g. m$b), but not for a matrix.

3.2.2 Code

# 10 x 3 integer matrix holding 1:10, 11:20 and 21:30 in its columns
m <- matrix(1:30, nrow = 10, ncol = 3)

# Rows 4-5 of columns 2-3
m[4:5, 2:3]

colnames(m) <- c("a", "b", "c")

# Rows whose b value is greater than 14 and at most 18
m[(m[, "b"] <= 18) & (m[, "b"] > 14), ]

# Same data as a data.frame
d <- as.data.frame(m)

# Replace every value of a that exceeds 4 with -1
d$a[d$a > 4] <- -1

# Overwrite column c with the first ten letters
d[["c"]] <- letters[1:10]

# Store the matrix and the data.frame in a list, then pull rows 2-3 of the
# data.frame back out of it
l <- list(m, d)
l[[2]][2:3, ]

# dplyr version: keep the rows of d whose b value lies between 14 and 18
# (comma-separated conditions in filter() are combined with a logical AND)
d %>%
  filter(b >= 14, b <= 18)

3.3 Practice 3

3.3.1 Questions

1 and 5 pulled from b and multiplied in sequence on m by row then column.
Would multiply the 1st columns of each, the 2nd columns of each, and the 3rd columns of each.
d row 1 columns 1 to 3 multiplied by the value from the last row, first column of m.

3.3.2 Code

Not shown here due to length

# Element-wise sine and cosine of every value in m
sin(m)
cos(m)

# Row/column sums and means over the first three columns of d.
# NOTE(review): m and d are not created in this section ("Not shown here due
# to length") — presumably they are the objects built in the module text;
# these calls need the first three columns of d to be numeric. Confirm.
rowSums(d[, 1:3])
colSums(d[, 1:3])
rowMeans(d[, 1:3])
colMeans(d[, 1:3])

3.4 Practice 4

3.4.1 Code

It adds an NA to the print statement for the 5th letter of the alphabet. Fix this by expanding sscript with another “th”, so that it has 5 elements.

# Print an ordinal statement for each of the first five letters.
# sscript holds one ordinal suffix per letter. The loop range is taken from
# sscript itself, so it can never run past the suffixes (indexing beyond the
# end of sscript would paste an NA into the statement).
sscript <- c("st", "nd", "rd", "th", "th")  # vector of superscripts
for (i in seq_along(sscript)) {  # iterator i over 1:5, one per suffix
  stmnt <- paste0(letters[i], " is the ", i, sscript[i],
                  " letter in the alphabet")
  print(stmnt)  # print statement
}

Turns 4 of the letters red
Adds axes to plots

# Draw four side-by-side panels. Each panel shows the letters a-d as plotting
# symbols and is titled with the ordinal statement for the i-th letter.
sscript <- c("st", "nd", "rd", "th")  # vector of superscripts
par(mfrow = c(1, 4), mar = c(0, 0, 1, 0.5))
for (i in seq_along(sscript)) {
  stmnt <- paste0(
    letters[i], " is the ", i, sscript[i], " letter in the alphabet"
  )
  # axes = FALSE is deliberately left out of this call, so axes are drawn
  plot(
    x = 1:4, y = rep(3, 4), ylim = c(1, 5), pch = letters[1:4],
    xlab = "", ylab = "", main = stmnt, cex = 2
  )
  # Disabled on purpose; when active it redraws the i-th letter in red:
  # points(i, 3, pch = letters[i], col = "red", cex = 2)
}

# Chunk 26
# Print a different message for each value of i depending on which condition
# it meets first. Conditions are tested in order, so condition 2 is only
# reached when i is 5 or more.
for (i in 1:10) {
  if (i < 5) {  # condition 1
    print(paste(i, "is less than", i + 1))
  } else if (i >= 3 && i <= 8) {  # condition 2 (&& because i is a scalar)
    print(paste(i, "is between", i - 1, "and", i + 1))
  } else {  # remaining conditions
    print(paste(i, "is greater than", i - 1))
  }
}

Create a for loop that iterates over a vector 1:20. Insert a condition into it such that it only prints out a result when the iterator’s value is 11

# Walk over 1 to 20 and print the iterator only when it equals 11
for (i in seq_len(20)) {
  if (i == 11) {
    print(i)
  }
}

3.5 Practice 5

3.5.1 Code

Turns the summary of “a” variables to NA.

# Three small data.frames, each with integer columns a and b
dat_list <- list(
  data.frame(a = 1:10, b = 21:30),
  data.frame(a = 31:40, b = 41:50),
  data.frame(a = 51:60, b = 61:70)
)

# Overwrite the first three values of the first column of x with 999 and
# give back the modified data.frame
dat_modify <- function(x) {
  x[1:3, 1] <- 999
  x
}

# Apply the modification to every data.frame and show the result
dat_list <- lapply(dat_list, dat_modify)
dat_list

# Summarise every element of a list that mixes data.frames with a function.
# Data.frames get their column sums plus a grand total; any other element
# gets a message naming its class.
dat_list2 <- c(dat_list, mean)  # add another element to dat_list
lapply(seq_along(dat_list2), function(x) {
  d <- dat_list2[[x]]  # extract element of list
  if (is.data.frame(d)) {  # check if it is a data.frame
    d[d == 999] <- NA  # convert any 999 values to NA
    # na.rm = FALSE keeps the NAs, so the sum of any column that held a 999
    # value comes back as NA; the grand total still drops NAs
    o <- c(colSums(d, na.rm = FALSE),
           "total" = sum(d, na.rm = TRUE))
  } else {  # if it is not a data.frame, build a message instead
    o <- paste("Operation not valid for a", class(d))
  }
  o  # result for this element
})

Can’t coerce list to double.

# Call each function stored in flist on the whole first data.frame of
# dat_list.
# NOTE(review): judging by the answer text that precedes this code ("Can't
# coerce list to double"), this call is meant to demonstrate a failure: the
# data.frame is passed as is rather than as a numeric vector.
flist <- list(mean, sd, range)
lapply(1:3, function(x) flist[[x]](dat_list[[1]]))

# Mean and standard deviation over all values of each data.frame; unlist()
# flattens the data.frame into one vector first. lapply() returns a list.
lapply(dat_list, function(df) mean(unlist(df)))
lapply(dat_list, function(df) sd(unlist(df)))

# The same two statistics, simplified by sapply()
sapply(dat_list, function(df) mean(unlist(df)))
sapply(dat_list, function(df) sd(unlist(df)))

# Sum of the first two values in the second column of each data.frame
lapply(dat_list, function(df) sum(df[1:2, 2]))

4 Module 4 practice answers

4.1 Practice 1

4.1.1 Questions

A tibble is an enhanced data.frame. Among other features, it provides more information on data types it contains when printing. It can be treated exactly like a data.frame though, in terms of indexing and other operations.
Base methods: tb_a$a; tb_a[["a"]]; tb_a[, "a"]. tidyverse, with tb_a %>% dplyr::select(a). Note that that gives back a tibble (or data.frame, if you have a data.frame). To get a vector back, use tb_a %>% dplyr::pull(a)
The data are messy, because the “column headers are values, not variable names” (see here). We would gather the data in the month rows, setting the month as the key and the mean flu cases as the value.
inner_join keeps only the rows of x that have a match in y, dropping non-matching rows from both tables. left_join keeps all rows of x, filling the columns that come from y with NAs where there is no match. right_join keeps all rows of y, filling the columns that come from x with NAs where there is no match. full_join keeps all rows from both x and y, filling NAs into the non-matched rows.

4.1.2 Code

# Create a small random tibble and write it to a csv file
set.seed(1)
dat <- tibble(a = sample(1:10), b = rnorm(10))
td <- "/path/where/you/want/to/write" # REPLACE THIS WITH YOUR OWN!!!!
# The output file is passed by position: the `path` argument of write_csv is
# deprecated in favour of `file`, and the positional form works with both
readr::write_csv(dat, file.path(td, "dummy.csv"))

# Read the same file back in
td <- "/path/where/you/want/to/write" # REPLACE THIS WITH YOUR OWN!!!!
readr::read_csv(file.path(td, "dummy.csv"))

Recreate tibbles first

# Chunk 13
# Four tibbles that share the key column v1. The seed and the order of the
# random draws below determine the values, so that order must not change.
set.seed(1)
t1 <- tibble(
  v1 = paste0("N", 1:5),
  v2 = rnorm(5)
)
t2 <- tibble(
  v1 = paste0("N", 1:5),
  v3 = runif(5)
)
t3 <- tibble(
  v1 = paste0("N", 1:7),
  v4 = sample(1:100, 7)
)
t4 <- tibble(
  v1 = paste0("N", c(1:2, 4:7, 11)),
  v5 = letters[sample(1:26, 7)]
)

Then do joins:

# Chain the four tibbles together with left joins, then with right joins
t1 %>%
  left_join(t2) %>%
  left_join(t3) %>%
  left_join(t4)
t1 %>%
  right_join(t2) %>%
  right_join(t3) %>%
  right_join(t4)

# Same joins, sorted by v5 (ascending for the left joins, descending for the
# right joins)
t1 %>%
  left_join(t2) %>%
  left_join(t3) %>%
  left_join(t4) %>%
  arrange(v5)
t1 %>%
  right_join(t2) %>%
  right_join(t3) %>%
  right_join(t4) %>%
  arrange(desc(v5))

# Find the FAOSTAT csv files in geospaar's extdata folder and read each one
fs <- dir(
  system.file("extdata/", package = "geospaar"),
  pattern = "FAOSTAT", full.names = TRUE
)
crops <- lapply(fs, readr::read_csv)

# For every table: keep the needed columns, spread the Element values into
# their own columns, and shorten the column names; then stack the tables
crops_df <- do.call(rbind, lapply(crops, function(crop_tb) {
  crop_tb %>%
    dplyr::select(Item, Area, Element, Year, Value) %>%
    pivot_wider(names_from = Element, values_from = Value) %>%
    rename(
      crop = Item, country = Area, year = Year,
      harv_area = `Area harvested`, prod = Production
    )
}))

# Add yield, replace the country names with three-letter codes, and add
# harvested area divided by 100 as harv_km2
crop_ylds <- crops_df %>%
  mutate(yield = prod / harv_area) %>%
  mutate(country = ifelse(country == "South Africa", "ZAF", country)) %>%
  mutate(country = ifelse(country == "Zambia", "ZMB", country)) %>%
  mutate(harv_km2 = harv_area / 100)

# Show the table with harv_km2 renamed (the result is not stored)
crop_ylds %>%
  rename(harv_area_km2 = harv_km2)

# Stack two tibbles, add the square of v2, and sort from largest to smallest v3
my_tb <- rbind(
  tibble(v1 = 1:10, v2 = 11:20),
  tibble(v1 = 11:20, v2 = 21:30)
) %>%
  mutate(v3 = v2^2) %>%
  arrange(desc(v3))

# Rows 1, 10 and 17 of the sorted table, columns v2 and v3 only
my_tb %>%
  slice(1, 10, 17) %>%
  dplyr::select(v2, v3)

4.2 Practice 2: Analysis

4.2.1 Questions

dplyr::filter
group_by(crop, country, y2k) is doing the splitting, on crop type, then country, and then year. summarize(...) is doing the apply using a mean. There is no combine line, as it is implicit.
Chunk 30 adds a filter for crop type (selecting out maize), and then simply groups on the y2k variable.
It doesn’t work when the output of the analysis is not tabular/a list, as with cor.test and lm. We can overcome this by 1) creating individual functions that reproduce the component outputs of the analysis (e.g. Chunk 33) and passing these as a list of functions using funs to summarise_all, 2) doing the splits outside of the pipeline (e.g. Chunk 36), or 3) using functions such as do and broom::tidy within the pipeline (e.g. Chunk 39).

4.2.2 Code

# South African sorghum rows from 2000 onwards, selected with dplyr ...
crop_ylds %>%
  filter(crop == "sorghum", country == "ZAF", year >= 2000)
# ... and the same selection with base indexing
crop_ylds[
  with(crop_ylds, crop == "sorghum" & country == "ZAF" & year >= 2000),
]

# Mean and standard deviation of production, harvested area and yield for
# South African sorghum from 2000 onwards. The functions are passed as a
# named list because funs() is deprecated in dplyr; the names give the same
# <column>_mean / <column>_sd output columns.
crop_ylds %>% filter(crop == "sorghum" & country == "ZAF" & year >= 2000) %>% 
  select(prod, harv_area, yield) %>% 
  summarise_all(list(mean = mean, sd = sd))

# Mean and standard deviation of production and harvested area for every
# crop/country combination
crop_ylds %>% group_by(crop, country) %>% select(prod, harv_area) %>%
  summarise_all(list(mean = mean, sd = sd))

# Correlation matrix of yield and harvested area for Zambian maize
crop_ylds %>%
  filter(crop == "maize", country == "ZMB") %>%
  select(yield, harv_area) %>%
  cor()

# Correlation test on the same two variables
dat <- crop_ylds %>%
  filter(crop == "maize", country == "ZMB")
cor.test(dat$harv_area, dat$yield)

# Mean maize yield across all rows
crop_ylds %>%
  filter(crop == "maize") %>%
  summarize(mu_yld = mean(yield))
# 2.07 t/ha

South Africa shows larger yield gains (0.061 t/ha/yr versus 0.03 t/ha/yr).

# Linear trend of maize yield over time, fitted separately for Zambia (ZMB)
# and South Africa (ZAF). The filtered data are built inside the data
# argument of lm().
summary(lm(yield ~ year, 
           data = crop_ylds %>% filter(crop == "maize" & country == "ZMB")))
summary(lm(yield ~ year, 
           data = crop_ylds %>% filter(crop == "maize" & country == "ZAF")))

# The same two models written as pipelines; the . placeholder passes the
# filtered data to the data argument of lm() rather than to its first argument
crop_ylds %>% filter(crop == "maize" & country == "ZMB") %>% 
  lm(yield ~ year, data = .) %>% summary() 
crop_ylds %>% filter(crop == "maize" & country == "ZAF") %>% 
  lm(yield ~ year, data = .) %>% summary()

# Fit yield ~ year separately for every crop/country group (sorghum excluded)
# and return the coefficient tables as one data frame. The model is tidied
# inside do(): tidying a rowwise do() result with broom::tidy(., column) is
# no longer supported by current broom releases.
crop_ylds %>% filter(crop != "sorghum") %>% group_by(crop, country) %>% 
  do(broom::tidy(lm(yield ~ year, data = .)))

4.3 Practice 3: Visualization

4.3.1 Questions

ggplot2 is built from grid graphics, and is based on an underlying visualization philosophy. It builds up graphics objects using the + operator, and easily does splits within the data using the “color” argument within aes and/or the facet_grid function. graphics plots can be faster to implement for exploratory analysis, ggplot2 has more attractive, presentation-grade defaults.
Because the syntax used in graphics plots is used in many of the plotting functions developed for spatial packages, including newer ones such as sf and stars.
Each of the three plots takes the axis labels exactly as they are specified to the axis arguments (either as they were specified in the formula in the case of Chunk 40 or to the “x” and “y” arguments in Chunk 41). You can change the names using the “xlab” and “ylab” arguments.
Using “col”, “pch”, and “cex” arguments.
You have to add the . to the “data” argument of plot, e.g. dat %>% plot(y ~ x, data = .)
You get just an empty grey background–ggplot won’t plot anything without a geom_* function added to the ggplot object.

4.3.2 Code

# Histogram of sorghum yields with 15 bins, default fill
crop_ylds %>%
  filter(crop == "sorghum") %>%
  ggplot(aes(x = yield)) +
  geom_histogram(bins = 15) +
  ggtitle("Distribution of sorghum yields")

# Same histogram with red bars
crop_ylds %>%
  filter(crop == "sorghum") %>%
  ggplot(aes(x = yield)) +
  geom_histogram(bins = 15, fill = "red") +
  ggtitle("Distribution of sorghum yields")

# Base graphics: South African wheat harvested area by year, as points.
# The . placeholder hands the filtered data to the data argument of plot().
crop_ylds %>%
  filter(crop == "wheat" & country == "ZAF") %>%
  plot(
    harv_area ~ year,
    data = .,
    pch = 16,
    col = "blue",
    xlab = "",
    ylab = "Harvested area (ha)",
    main = "South Africa wheat (1961-2017)"
  )

# Same plot drawn as a line (type = "l")
crop_ylds %>%
  filter(crop == "wheat" & country == "ZAF") %>%
  plot(
    harv_area ~ year,
    data = .,
    pch = 16,
    type = "l",
    col = "blue",
    xlab = "",
    ylab = "Harvested area (ha)",
    main = "South Africa wheat (1961-2017)"
  )

# ggplot2: South African wheat harvested area by year, as blue points
crop_ylds %>%
  filter(crop == "wheat" & country == "ZAF") %>%
  ggplot() +
  geom_point(aes(x = year, y = harv_area), color = "blue") +
  labs(
    x = "", y = "Harvested area (ha)",
    title = "South Africa wheat (1961-2017)"
  )

# Same data drawn as a blue line
crop_ylds %>%
  filter(crop == "wheat" & country == "ZAF") %>%
  ggplot() +
  geom_line(aes(x = year, y = harv_area), color = "blue") +
  labs(
    x = "", y = "Harvested area (ha)",
    title = "South Africa wheat (1961-2017)"
  )

# Wheat harvested area by year, one line per country, coloured red and blue
crop_ylds %>%
  filter(crop == "wheat") %>%
  ggplot() +
  geom_line(aes(x = year, y = harv_area, color = country)) +
  scale_color_manual(values = c("red", "blue")) +
  labs(x = "", y = "Harvested area (ha)", title = "Wheat (1961-2017)")

# extra
# Same plot as the wheat-by-country line plot, but with harvested area on a
# log10 scale. The y label states the transformation, because the plotted
# values are log10(harv_area) and not hectares.
crop_ylds %>% filter(crop == "wheat") %>% 
  ggplot() + geom_line(aes(year, log10(harv_area), color = country)) +
  scale_color_manual(values = c("red", "blue")) + 
  xlab("") + ylab("log10 harvested area (ha)") +  
  ggtitle("Wheat (1961-2017)")

# South African wheat harvested area by year: points plus a smoothed trend.
# The aesthetics are declared once in ggplot() and shared by both layers.
crop_ylds %>%
  filter(crop == "wheat" & country == "ZAF") %>%
  ggplot(aes(x = year, y = harv_area)) +
  geom_point() +
  geom_smooth() +
  labs(
    x = "", y = "Harvested area (ha)",
    title = "South African wheat (1961-2017)"
  )

# ggplot2
# Histogram of Zambian wheat harvested area: 10 bins, blue bars, black outlines
crop_ylds %>%
  filter(crop == "wheat" & country == "ZMB") %>%
  ggplot(aes(x = harv_area)) +
  geom_histogram(bins = 10, col = "black", fill = "blue") +
  labs(x = "Harvested area (ha)", title = "Zambian Wheat (1961-2017)")

# hist
# with dplyr: pull() extracts harv_area as a vector, which the pipe passes to
# hist() as its first argument
crop_ylds %>%
  filter(crop == "wheat" & country == "ZMB") %>%
  pull(harv_area) %>%
  hist(
    main = "Zambian Wheat (1961-2017)",
    xlab = "Harvested area (ha)",
    col = "blue"
  )
# with base subsetting
hist(
  with(crop_ylds, harv_area[crop == "wheat" & country == "ZMB"]),
  main = "Zambian Wheat (1961-2017)",
  xlab = "Harvested area (ha)",
  col = "blue"
)

# South African harvested area by year, one panel per crop, as points with a
# smoothed trend. The y label now names the plotted variable (harv_area); the
# original labelled it as yield. The manual colour scale was dropped because
# no colour aesthetic is mapped in this plot.
crop_ylds %>% filter(country == "ZAF") %>% 
  ggplot() + geom_point(aes(x = year, y = harv_area)) +
  geom_smooth(aes(x = year, y = harv_area)) + 
  facet_grid(cols = vars(crop)) +
  ylab("Harvested area (ha)") + xlab("")

Back to home