pRactice corner: Practice questions

lruolin

The code below is all taken from the websites listed in the References section. This post lets me practice on past exam questions.

Load required packages

library(tidyverse)

Brendan Cullen’s website

https://tidyverse-exam-v2-solutions.netlify.app/

Basic Operations

# Read in person.csv and store result in a tibble called person

# Fetch the CSV straight from the solutions site; read_csv() returns a tibble.
person <- read_csv(
  file = "https://tidyverse-exam-v2-solutions.netlify.app/person.csv"
)

print(person)

# A tibble: 5 x 3
  person_id personal_name family_name
  <chr>     <chr>         <chr>      
1 dyer      William       Dyer       
2 pb        Frank         Pabodie    
3 lake      Anderson      Lake       
4 roe       Valentina     Roerich    
5 danforth  Frank         Danforth

# Create a tibble containing only the family and personal names.

# Column order in the result follows the order given to select().
select(person, family_name, personal_name)

# A tibble: 5 x 2
  family_name personal_name
  <chr>       <chr>        
1 Dyer        William      
2 Pabodie     Frank        
3 Lake        Anderson     
4 Roerich     Valentina    
5 Danforth    Frank

# Create a new tibble containing only the rows in which family names come before the letter M. 

# String comparison: keeps rows whose family name sorts before "M".
filter(person, family_name < "M")

# A tibble: 3 x 3
  person_id personal_name family_name
  <chr>     <chr>         <chr>      
1 dyer      William       Dyer       
2 lake      Anderson      Lake       
3 danforth  Frank         Danforth

# Display all the rows in person sorted by family name length with the longest name first

# Print the tibble in its original row order.
person

# A tibble: 5 x 3
  person_id personal_name family_name
  <chr>     <chr>         <chr>      
1 dyer      William       Dyer       
2 pb        Frank         Pabodie    
3 lake      Anderson      Lake       
4 roe       Valentina     Roerich    
5 danforth  Frank         Danforth

# Negating the length sorts from the longest family name to the shortest.
arrange(person, -str_length(family_name))

# A tibble: 5 x 3
  person_id personal_name family_name
  <chr>     <chr>         <chr>      
1 danforth  Frank         Danforth   
2 pb        Frank         Pabodie    
3 roe       Valentina     Roerich    
4 dyer      William       Dyer       
5 lake      Anderson      Lake

Cleaning and counting

# Read the file measurements.csv to create a tibble called measurements

# Fetch the measurements CSV and keep it as a tibble.
measurements <- read_csv(
  file = "https://tidyverse-exam-v2-solutions.netlify.app/measurements.csv"
)

# Compact overview: one line per column with its type and first values.
glimpse(measurements)

Rows: 21
Columns: 4
$ visit_id <dbl> 619, 619, 622, 622, 734, 734, 734, 735, 735, 735, 7…
$ visitor  <chr> "dyer", "dyer", "dyer", "dyer", "pb", "lake", "pb",…
$ quantity <chr> "rad", "sal", "rad", "sal", "rad", "sal", "temp", "…
$ reading  <dbl> 9.82, 0.13, 7.80, 0.09, 8.41, 0.05, -21.50, 7.22, 0…

# Create a tibble containing only rows where none of the values are NA and save in a tibble called cleaned

# With no columns named, drop_na() removes every row that has any NA.
cleaned <- drop_na(measurements)

print(cleaned)

# A tibble: 18 x 4
   visit_id visitor quantity reading
      <dbl> <chr>   <chr>      <dbl>
 1      619 dyer    rad         9.82
 2      619 dyer    sal         0.13
 3      622 dyer    rad         7.8 
 4      622 dyer    sal         0.09
 5      734 pb      rad         8.41
 6      734 lake    sal         0.05
 7      734 pb      temp      -21.5 
 8      735 pb      rad         7.22
 9      751 pb      rad         4.35
10      751 pb      temp      -18.5 
11      752 lake    rad         2.19
12      752 lake    sal         0.09
13      752 lake    temp      -16   
14      752 roe     sal        41.6 
15      837 lake    rad         1.46
16      837 lake    sal         0.21
17      837 roe     sal        22.5 
18      844 roe     rad        11.2

# Count the number of measurements of each type of quantity in cleaned. Your result should have one row for each quantity "rad", "sal", "temp"

# count() groups by quantity and stores the number of rows per group in `n`.
count(cleaned, quantity)

# A tibble: 3 x 2
  quantity     n
  <chr>    <int>
1 rad          8
2 sal          7
3 temp         3

# Display the minimum and maximum value of reading separately for each quantity in cleaned

# One output row per quantity, holding its smallest and largest reading.
summarise(
  group_by(cleaned, quantity),
  min_reading = min(reading),
  max_reading = max(reading)
)

# A tibble: 3 x 3
  quantity min_reading max_reading
  <chr>          <dbl>       <dbl>
1 rad             1.46        11.2
2 sal             0.05        41.6
3 temp          -21.5        -16

# Create a tibble in which all salinity readings greater than 1 are divided by 100.

# Divide salinity readings greater than 1 by 100; every other reading is
# passed through unchanged.
cleaned %>%
  mutate(reading = case_when(
    quantity == "sal" & reading > 1 ~ reading / 100,
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    TRUE ~ reading
  ))

# A tibble: 18 x 4
   visit_id visitor quantity reading
      <dbl> <chr>   <chr>      <dbl>
 1      619 dyer    rad        9.82 
 2      619 dyer    sal        0.13 
 3      622 dyer    rad        7.8  
 4      622 dyer    sal        0.09 
 5      734 pb      rad        8.41 
 6      734 lake    sal        0.05 
 7      734 pb      temp     -21.5  
 8      735 pb      rad        7.22 
 9      751 pb      rad        4.35 
10      751 pb      temp     -18.5  
11      752 lake    rad        2.19 
12      752 lake    sal        0.09 
13      752 lake    temp     -16    
14      752 roe     sal        0.416
15      837 lake    rad        1.46 
16      837 lake    sal        0.21 
17      837 roe     sal        0.225
18      844 roe     rad       11.2

Combining Data

# Read visited.csv and drop rows containing any NAs, assigning the results to a new tibble called visited

# Read the visits and discard any row that contains an NA.
visited <- drop_na(
  read_csv(file = "https://tidyverse-exam-v2-solutions.netlify.app/visited.csv")
)

print(visited)

# A tibble: 7 x 3
  visit_id site_id visit_date
     <dbl> <chr>   <date>    
1      619 DR-1    1927-02-08
2      622 DR-1    1927-02-10
3      734 DR-3    1930-01-07
4      735 DR-3    1930-01-12
5      751 DR-3    1930-02-26
6      837 MSK-4   1932-01-14
7      844 DR-1    1932-03-22

# Use an inner join to combine visited with cleaned, matching on visit_id
# Print cleaned, which is joined to visited below.
cleaned

# A tibble: 18 x 4
   visit_id visitor quantity reading
      <dbl> <chr>   <chr>      <dbl>
 1      619 dyer    rad         9.82
 2      619 dyer    sal         0.13
 3      622 dyer    rad         7.8 
 4      622 dyer    sal         0.09
 5      734 pb      rad         8.41
 6      734 lake    sal         0.05
 7      734 pb      temp      -21.5 
 8      735 pb      rad         7.22
 9      751 pb      rad         4.35
10      751 pb      temp      -18.5 
11      752 lake    rad         2.19
12      752 lake    sal         0.09
13      752 lake    temp      -16   
14      752 roe     sal        41.6 
15      837 lake    rad         1.46
16      837 lake    sal         0.21
17      837 roe     sal        22.5 
18      844 roe     rad        11.2

# Inner join: only visit_ids present in both visited and cleaned are kept.
combined <- inner_join(visited, cleaned, by = "visit_id")

print(combined)

# A tibble: 14 x 6
   visit_id site_id visit_date visitor quantity reading
      <dbl> <chr>   <date>     <chr>   <chr>      <dbl>
 1      619 DR-1    1927-02-08 dyer    rad         9.82
 2      619 DR-1    1927-02-08 dyer    sal         0.13
 3      622 DR-1    1927-02-10 dyer    rad         7.8 
 4      622 DR-1    1927-02-10 dyer    sal         0.09
 5      734 DR-3    1930-01-07 pb      rad         8.41
 6      734 DR-3    1930-01-07 lake    sal         0.05
 7      734 DR-3    1930-01-07 pb      temp      -21.5 
 8      735 DR-3    1930-01-12 pb      rad         7.22
 9      751 DR-3    1930-02-26 pb      rad         4.35
10      751 DR-3    1930-02-26 pb      temp      -18.5 
11      837 MSK-4   1932-01-14 lake    rad         1.46
12      837 MSK-4   1932-01-14 lake    sal         0.21
13      837 MSK-4   1932-01-14 roe     sal        22.5 
14      844 DR-1    1932-03-22 roe     rad        11.2

# find the highest rad reading at each site. 

# Keep only the "rad" rows, then take the largest reading for each site.
max_rad <- summarise(
  group_by(filter(combined, quantity == "rad"), site_id),
  max_rad = max(reading)
)

# Find the date of the highest radiation reading at each site

# Compute the maximum rad reading per site and date, then keep only the
# site/date rows whose maximum equals that site's overall maximum in max_rad.
combined %>%
  filter(quantity == "rad") %>%
  group_by(site_id, visit_date) %>%
  # "drop_last" is the default regrouping; stating it silences the message
  # and leaves the result grouped by site_id as before.
  summarize(max_rad = max(reading), .groups = "drop_last") %>%
  # Name the join keys instead of letting semi_join() guess them; these are
  # the two columns the tables have in common.
  semi_join(max_rad, by = c("site_id", "max_rad")) %>%
  select(visit_date, everything())

# A tibble: 3 x 3
# Groups:   site_id [3]
  visit_date site_id max_rad
  <date>     <chr>     <dbl>
1 1932-03-22 DR-1      11.2 
2 1930-01-07 DR-3       8.41
3 1932-01-14 MSK-4      1.46

Functional Programming

# Write a function called summarize_table that takes a title string and a tibble as input and returns a string that says something like "title has n rows and n columns".

# Build a one-sentence description of a table's size.
#   title: label to use for the table in the sentence
#   df:    data frame or tibble whose dimensions are reported
# Returns a glue string of the form "<title> has <r> rows and <c> columns."
summarize_table <- function(title, df) {
  dims <- dim(df)

  # nrow/ncol are handed to glue as named values for the template.
  glue::glue(
    "{title} has {nrow} rows and {ncol} columns.",
    nrow = dims[1],
    ncol = dims[2]
  )
}

# Example call on the mtcars data frame.
summarize_table("mtcars", mtcars)

mtcars has 32 rows and 11 columns.

# Write another function called show_columns that takes a string and a tibble as input and returns a string that says something like, “table has columns name, name, name”. For example, show_columns('person', person) should return the string "person has columns person_id, personal_name, family_name".

# Build a sentence listing a table's column names.
#   title: label to use for the table in the sentence
#   df:    data frame or tibble whose column names are listed
# Returns a glue string of the form "<title> has columns a, b, c".
show_columns <- function(title, df) {
  # Collapse all column names into one comma-separated string.
  col_names <- str_c(names(df), collapse = ", ")

  glue::glue("{title} has columns {col_names}")
}

# Example call on the person tibble.
show_columns('person', person)

person has columns person_id, personal_name, family_name

# The function long_name checks whether a string is longer than 4 characters. Use this function and a function from purrr to create a logical vector that contains the value TRUE where family names in the tibble person are longer than 4 characters, and FALSE where they are 4 characters or less.


# Test whether each string has more than 4 characters.
#   name: character vector
# Returns a logical vector of the same length as `name`.
long_name <- function(name) {
  n_chars <- stringr::str_length(name)
  n_chars > 4
}

# map_lgl() calls long_name() once per family name and returns a logical
# vector, stored in the new column long_family_name.
mutate(person, long_family_name = map_lgl(family_name, long_name))

# A tibble: 5 x 4
  person_id personal_name family_name long_family_name
  <chr>     <chr>         <chr>       <lgl>           
1 dyer      William       Dyer        FALSE           
2 pb        Frank         Pabodie     TRUE            
3 lake      Anderson      Lake        FALSE           
4 roe       Valentina     Roerich     TRUE            
5 danforth  Frank         Danforth    TRUE

Marly Gotti’s website

https://marlycormar.github.io/tidyverse_sample_exam/sample_exam_sols/sols.html

Q1

The file at_health_facilities.csv contains a tidy dataset with four columns:

The ISO3 code of the country that reported data. The year for which data was reported. The percentage of HIV-positive children born to HIV-positive mothers age 15–17. The percentage of HIV-positive children born to HIV-positive mothers age 20–34. Please answer the following questions:

How many countries reported data? What is the difference between the minimum and maximum year with valid data for each country? How many countries reported data in 3 or more years? Which countries reported 100% incidence for at least one year in either age group?

# Read the data and let janitor::clean_names() standardise the column names.
hiv <- janitor::clean_names(
  read_csv(
    file = "https://education.rstudio.com/blog/2020/02/instructor-certification-exams/at_health_facilities.csv"
  )
)

glimpse(hiv)

Rows: 225
Columns: 4
$ iso3      <chr> "AFG", "ALB", "ALB", "ARG", "ARM", "ARM", "ARM", "…
$ year      <dbl> 2010, 2005, 2008, 2012, 2000, 2005, 2010, 2006, 20…
$ age_15_17 <dbl> 33, 98, 98, 100, 93, 99, 100, 81, 12, 16, 19, 31, …
$ age_20_34 <chr> "29", "96", "98", "100", "87", "100", "100", "87",…

# how many countries reported data?

# distinct() leaves one row per iso3 code, so the row count is the answer.
hiv %>%
  distinct(iso3) %>%
  nrow() # 100 countries

[1] 100

# what is the difference between min and max year with valid data for each country?

# age 20-34 has some dashes to be filtered out

# Drop rows where age_20_34 is a dash, then take the span between the first
# and last remaining year for each country.
hiv %>%
  filter(age_20_34 != "-") %>%
  group_by(iso3) %>%
  # range() gives c(min, max); diff() of that pair is max - min.
  summarise(diff = diff(range(year)))

# A tibble: 100 x 2
   iso3   diff
   <chr> <dbl>
 1 AFG       0
 2 ALB       3
 3 ARG       0
 4 ARM      10
 5 AZE       0
 6 BDI       5
 7 BEN      10
 8 BFA       7
 9 BGD       8
10 BIH       5
# … with 90 more rows

# how many countries reported data in 3 or more years?

# Reduce to unique country/year pairs, count the pairs per country, and keep
# the countries that have at least three.
hiv %>%
  distinct(iso3, year) %>%
  count(iso3, name = "years_count") %>%
  filter(years_count >= 3) # 34 countries

# A tibble: 34 x 2
   iso3  years_count
   <chr>       <int>
 1 ARM             3
 2 BEN             3
 3 BFA             3
 4 BGD             6
 5 CMR             3
 6 COD             3
 7 COL             3
 8 DOM             3
 9 EGY             5
10 ETH             3
# … with 24 more rows

# which countries reported 100% incidence rate for at least one year in either age group?

# Countries with a reading of 100 in either age group in at least one year.
hiv %>%
  # age_20_34 is a character column (it contains "-" placeholders), so it is
  # compared with the string "100" instead of relying on implicit coercion
  # of the number 100.
  filter(age_15_17 == 100 | age_20_34 == "100") %>%
  distinct(iso3) # 18 countries

# A tibble: 18 x 1
   iso3 
   <chr>
 1 ARG  
 2 ARM  
 3 BRB  
 4 BLR  
 5 BIH  
 6 CUB  
 7 DOM  
 8 JAM  
 9 KAZ  
10 KGZ  
11 MKD  
12 MDA  
13 MNE  
14 LCA  
15 SRB  
16 THA  
17 UKR  
18 URY

Q2

A student has sent you the file rmd-country-profile.Rmd, which is an R Markdown document analyzing the data in at_health_facilities.csv for Bangladesh. They could not knit the file, and are providing you with the raw .Rmd file instead of a rendered file.

Go through the file, fixing things that are preventing it from knitting cleanly. Change the two lines of bold text to H2-level headers to organize the document, and add a table of contents. Convert this R Markdown report for Bangladesh into a parameterized report with the country’s iso3 code as its parameter. Knit a new country profile for Egypt (ISO3 code “EGY”).

Q3

# Fetch the wide-format infant HIV table.
infant_hiv <- read_csv(
  file = "https://education.rstudio.com/blog/2020/02/instructor-certification-exams/infant_hiv.csv"
)

# tidy the data into 4 columns: ISO3, year, state, value

# Every column except ISO3 is named "<year> <state>"; split those names into
# two columns and stack the cell contents into `value`.
tidy <- pivot_longer(
  infant_hiv,
  cols = !ISO3,
  names_to = c("year", "state"),
  names_pattern = "(.*) (.*)"
) %>%
  mutate(value = case_when(
    # The entries "-" and ">95%" are turned into missing values.
    value %in% c("-", ">95%") ~ NA_character_,
    # Any other entry has its first "%" removed.
    TRUE ~ str_replace(value, "%", "")
  ))

print(tidy)

# A tibble: 5,184 x 4
   ISO3  year  state value
   <chr> <chr> <chr> <chr>
 1 AFG   2009  est   <NA> 
 2 AFG   2009  hi    <NA> 
 3 AFG   2009  lo    <NA> 
 4 AFG   2010  est   <NA> 
 5 AFG   2010  hi    <NA> 
 6 AFG   2010  lo    <NA> 
 7 AFG   2011  est   <NA> 
 8 AFG   2011  hi    <NA> 
 9 AFG   2011  lo    <NA> 
10 AFG   2012  est   <NA> 
# … with 5,174 more rows

# write the function

# Read a wide CSV with an ISO3 column plus "<year> <state>" columns and
# return it in long form.
#   file: path or URL passed to read_csv()
# Returns a tibble with columns ISO3, year, state and value (character);
# the entries "-" and ">95%" become NA and the first "%" is removed from
# the rest.
tidy_data <- function(file) {
  wide <- read_csv(file)

  long <- pivot_longer(
    wide,
    cols = !ISO3,
    names_to = c("year", "state"),
    names_pattern = "(.*) (.*)"
  )

  mutate(long, value = case_when(
    value %in% c("-", ">95%") ~ NA_character_,
    TRUE ~ str_replace(value, "%", "")
  ))
}

# Run tidy_data() on the infant_hiv.csv URL.
tidy_data("https://education.rstudio.com/blog/2020/02/instructor-certification-exams/infant_hiv.csv")

# A tibble: 5,184 x 4
   ISO3  year  state value
   <chr> <chr> <chr> <chr>
 1 AFG   2009  est   <NA> 
 2 AFG   2009  hi    <NA> 
 3 AFG   2009  lo    <NA> 
 4 AFG   2010  est   <NA> 
 5 AFG   2010  hi    <NA> 
 6 AFG   2010  lo    <NA> 
 7 AFG   2011  est   <NA> 
 8 AFG   2011  hi    <NA> 
 9 AFG   2011  lo    <NA> 
10 AFG   2012  est   <NA> 
# … with 5,174 more rows

Q4

# Fetch the ranking data (used below via its item and rank columns).
ranking <- read_csv(
  file = "https://education.rstudio.com/blog/2020/02/instructor-certification-exams/ranking.csv"
)

# For each item, turn the rank counts into proportions of that item's total
# and plot the negative share against the positive share.
ranking %>%
  group_by(item) %>%
  count(rank) %>%
  pivot_wider(names_from = rank,
              values_from = n) %>%
  # The data is still grouped by item here, so sum() yields each item's own
  # total; ranks an item never received are NA after the pivot and skipped.
  mutate(num = sum(positive, negative, indifferent, na.rm = TRUE)) %>%
  # across() supersedes mutate_at(). It never touches the grouping column
  # (item), so `-num` selects exactly the rank columns.
  mutate(across(-num, ~ round(.x / num, digits = 2))) %>%
  # Drop the grouping before plotting.
  ungroup() %>%
  ggplot(aes(negative, positive)) +
  # alpha is a fixed setting and must sit outside aes(); inside aes() it is
  # mapped as data and only adds a legend. size is mapped on the points only
  # so that it is not passed on to the smoother.
  geom_point(aes(size = num), alpha = 0.25) +
  geom_smooth(method = "lm")

References