Practice questions for R Studio Exams
The code below is taken from the websites listed in the Reference section. I wrote this post to practice on past exam questions.
https://tidyverse-exam-v2-solutions.netlify.app/
# Read in person.csv and store result in a tibble called person
person <- read_csv("https://tidyverse-exam-v2-solutions.netlify.app/person.csv")
person
# A tibble: 5 x 3
person_id personal_name family_name
<chr> <chr> <chr>
1 dyer William Dyer
2 pb Frank Pabodie
3 lake Anderson Lake
4 roe Valentina Roerich
5 danforth Frank Danforth
# Create a tibble containing only family name and personal names.
person %>%
select(family_name, personal_name)
# A tibble: 5 x 2
family_name personal_name
<chr> <chr>
1 Dyer William
2 Pabodie Frank
3 Lake Anderson
4 Roerich Valentina
5 Danforth Frank
# Create a new tibble containing only the rows in which family names come before the letter M.
person %>%
filter(family_name < "M")
# A tibble: 3 x 3
person_id personal_name family_name
<chr> <chr> <chr>
1 dyer William Dyer
2 lake Anderson Lake
3 danforth Frank Danforth
# Display all the rows in person sorted by family name length with the longest name first
person
# A tibble: 5 x 3
person_id personal_name family_name
<chr> <chr> <chr>
1 dyer William Dyer
2 pb Frank Pabodie
3 lake Anderson Lake
4 roe Valentina Roerich
5 danforth Frank Danforth
person %>%
arrange(desc(str_length(family_name)))
# A tibble: 5 x 3
person_id personal_name family_name
<chr> <chr> <chr>
1 danforth Frank Danforth
2 pb Frank Pabodie
3 roe Valentina Roerich
4 dyer William Dyer
5 lake Anderson Lake
# Read the file measurements.csv to create a tibble called measurements
measurements <- read_csv("https://tidyverse-exam-v2-solutions.netlify.app/measurements.csv")
glimpse(measurements)
Rows: 21
Columns: 4
$ visit_id <dbl> 619, 619, 622, 622, 734, 734, 734, 735, 735, 735, 7…
$ visitor <chr> "dyer", "dyer", "dyer", "dyer", "pb", "lake", "pb",…
$ quantity <chr> "rad", "sal", "rad", "sal", "rad", "sal", "temp", "…
$ reading <dbl> 9.82, 0.13, 7.80, 0.09, 8.41, 0.05, -21.50, 7.22, 0…
# Create a tibble containing only rows where none of the values are NA and save in a tibble called cleaned
cleaned <- measurements %>%
drop_na()
cleaned
# A tibble: 18 x 4
visit_id visitor quantity reading
<dbl> <chr> <chr> <dbl>
1 619 dyer rad 9.82
2 619 dyer sal 0.13
3 622 dyer rad 7.8
4 622 dyer sal 0.09
5 734 pb rad 8.41
6 734 lake sal 0.05
7 734 pb temp -21.5
8 735 pb rad 7.22
9 751 pb rad 4.35
10 751 pb temp -18.5
11 752 lake rad 2.19
12 752 lake sal 0.09
13 752 lake temp -16
14 752 roe sal 41.6
15 837 lake rad 1.46
16 837 lake sal 0.21
17 837 roe sal 22.5
18 844 roe rad 11.2
# Count the number of measurements of each type of quantity in cleaned. Your result should have one row for each quantity "rad", "sal", "temp"
# Tally the rows of `cleaned` per quantity; the tally lands in column `n`.
cleaned %>%
  count(quantity)
# A tibble: 3 x 2
quantity n
<chr> <int>
1 rad 8
2 sal 7
3 temp 3
# Display the minimum and maximum values of reading separately for each quantity in cleaned
# Smallest and largest `reading` within each quantity.
# `.names` puts the function label first, so the output columns are
# `min_reading` and `max_reading`.
cleaned %>%
  group_by(quantity) %>%
  summarise(
    across(reading, list(min = min, max = max), .names = "{.fn}_{.col}")
  )
# A tibble: 3 x 3
quantity min_reading max_reading
<chr> <dbl> <dbl>
1 rad 1.46 11.2
2 sal 0.05 41.6
3 temp -21.5 -16
# Create a tibble in which all salinity readings greater than 1 are divided by 100.
# Divide salinity ("sal") readings greater than 1 by 100; every other reading
# is passed through unchanged.
cleaned %>%
  mutate(reading = case_when(
    quantity == "sal" & reading > 1 ~ reading / 100,
    # Use the reserved word TRUE as the catch-all: `T` is an ordinary variable
    # that can be reassigned, which would silently break this fallback.
    TRUE ~ reading
  ))
# A tibble: 18 x 4
visit_id visitor quantity reading
<dbl> <chr> <chr> <dbl>
1 619 dyer rad 9.82
2 619 dyer sal 0.13
3 622 dyer rad 7.8
4 622 dyer sal 0.09
5 734 pb rad 8.41
6 734 lake sal 0.05
7 734 pb temp -21.5
8 735 pb rad 7.22
9 751 pb rad 4.35
10 751 pb temp -18.5
11 752 lake rad 2.19
12 752 lake sal 0.09
13 752 lake temp -16
14 752 roe sal 0.416
15 837 lake rad 1.46
16 837 lake sal 0.21
17 837 roe sal 0.225
18 844 roe rad 11.2
# Read visited.csv and drop rows containing any NAs, assigning the results to a new tibble called visited
visited <- read_csv("https://tidyverse-exam-v2-solutions.netlify.app/visited.csv") %>%
drop_na()
visited
# A tibble: 7 x 3
visit_id site_id visit_date
<dbl> <chr> <date>
1 619 DR-1 1927-02-08
2 622 DR-1 1927-02-10
3 734 DR-3 1930-01-07
4 735 DR-3 1930-01-12
5 751 DR-3 1930-02-26
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
# Use an inner join to combine visited with cleaned, matching on visit_id
cleaned
# A tibble: 18 x 4
visit_id visitor quantity reading
<dbl> <chr> <chr> <dbl>
1 619 dyer rad 9.82
2 619 dyer sal 0.13
3 622 dyer rad 7.8
4 622 dyer sal 0.09
5 734 pb rad 8.41
6 734 lake sal 0.05
7 734 pb temp -21.5
8 735 pb rad 7.22
9 751 pb rad 4.35
10 751 pb temp -18.5
11 752 lake rad 2.19
12 752 lake sal 0.09
13 752 lake temp -16
14 752 roe sal 41.6
15 837 lake rad 1.46
16 837 lake sal 0.21
17 837 roe sal 22.5
18 844 roe rad 11.2
combined <- visited %>%
inner_join(cleaned, by = "visit_id")
combined
# A tibble: 14 x 6
visit_id site_id visit_date visitor quantity reading
<dbl> <chr> <date> <chr> <chr> <dbl>
1 619 DR-1 1927-02-08 dyer rad 9.82
2 619 DR-1 1927-02-08 dyer sal 0.13
3 622 DR-1 1927-02-10 dyer rad 7.8
4 622 DR-1 1927-02-10 dyer sal 0.09
5 734 DR-3 1930-01-07 pb rad 8.41
6 734 DR-3 1930-01-07 lake sal 0.05
7 734 DR-3 1930-01-07 pb temp -21.5
8 735 DR-3 1930-01-12 pb rad 7.22
9 751 DR-3 1930-02-26 pb rad 4.35
10 751 DR-3 1930-02-26 pb temp -18.5
11 837 MSK-4 1932-01-14 lake rad 1.46
12 837 MSK-4 1932-01-14 lake sal 0.21
13 837 MSK-4 1932-01-14 roe sal 22.5
14 844 DR-1 1932-03-22 roe rad 11.2
# find the highest rad reading at each site.
# Largest radiation ("rad") reading per site: one row per site_id, with the
# maximum stored in column `max_rad`.
max_rad <- combined %>%
  group_by(site_id) %>%
  filter(quantity == "rad") %>%
  summarise(max_rad = max(reading))
# Find the date of the highest radiation reading at each site
# Date of the highest radiation reading at each site.
#   1. Collapse the "rad" rows to the maximum reading per (site_id, visit_date).
#   2. Keep only the rows whose (site_id, max_rad) pair also appears in the
#      per-site maxima table `max_rad`.
combined %>%
  filter(quantity == "rad") %>%
  group_by(site_id, visit_date) %>%
  # State the resulting grouping (by site_id) explicitly instead of relying on
  # the default, which also prints an informational message.
  summarize(max_rad = max(reading), .groups = "drop_last") %>%
  # Spell out the join keys: without `by`, the match is made on whatever
  # columns the two tables happen to share and a message is printed.
  semi_join(max_rad, by = c("site_id", "max_rad")) %>%
  select(visit_date, everything())
# A tibble: 3 x 3
# Groups: site_id [3]
visit_date site_id max_rad
<date> <chr> <dbl>
1 1932-03-22 DR-1 11.2
2 1930-01-07 DR-3 8.41
3 1932-01-14 MSK-4 1.46
# Write a function called summarize_table that takes a title string and a tibble as input and returns a string that says something like "title has n rows and n columns".
#' Describe the dimensions of a table in one sentence.
#'
#' @param title Label to use for the table in the sentence.
#' @param df A data frame or tibble.
#' @return A glue string of the form
#'   "<title> has <rows> rows and <columns> columns."
summarize_table <- function(title, df) {
  # Evaluate nrow()/ncol() inside the template instead of storing the results
  # in locals named `nrow`/`ncol`, which would shadow the base functions.
  glue::glue("{title} has {nrow(df)} rows and {ncol(df)} columns.")
}
summarize_table("mtcars", mtcars)
mtcars has 32 rows and 11 columns.
# Write another function called show_columns that takes a string and a tibble as input and returns a string that says something like, “table has columns name, name, name”. For example, show_columns('person', person) should return the string "person has columns person_id, personal_name, family_name".
#' List the column names of a table in one sentence.
#'
#' @param title Label to use for the table in the sentence.
#' @param df A data frame or tibble.
#' @return A glue string of the form "<title> has columns a, b, c".
show_columns <- function(title, df) {
  # Join all column names into one comma-separated string.
  col_names <- stringr::str_c(names(df), collapse = ", ")
  glue::glue("{title} has columns {col_names}")
}
show_columns('person', person)
person has columns person_id, personal_name, family_name
# The function long_name checks whether a string is longer than 4 characters. Use this function and a function from purrr to create a logical vector that contains the value TRUE where family names in the tibble person are longer than 4 characters, and FALSE where they are 4 characters or less.
#' Test whether a string has more than 4 characters.
#'
#' @param name A character vector.
#' @return A logical vector, TRUE where the string length exceeds 4.
long_name <- function(name) {
  n_chars <- stringr::str_length(name)
  n_chars > 4
}
person %>%
mutate(long_family_name = map_lgl(family_name, long_name))
# A tibble: 5 x 4
person_id personal_name family_name long_family_name
<chr> <chr> <chr> <lgl>
1 dyer William Dyer FALSE
2 pb Frank Pabodie TRUE
3 lake Anderson Lake FALSE
4 roe Valentina Roerich TRUE
5 danforth Frank Danforth TRUE
https://marlycormar.github.io/tidyverse_sample_exam/sample_exam_sols/sols.html
The file at_health_facilities.csv contains a tidy dataset with four columns:
The ISO3 code of the country that reported data. The year for which data was reported. The percentage of HIV-positive children born to HIV-positive mothers age 15–17. The percentage of HIV-positive children born to HIV-positive mothers age 20–34. Please answer the following questions:
How many countries reported data? What is the difference between the minimum and maximum year with valid data for each country? How many countries reported data in 3 or more years? Which countries reported 100% incidence for at least one year in either age group?
hiv <- read_csv("https://education.rstudio.com/blog/2020/02/instructor-certification-exams/at_health_facilities.csv") %>%
janitor::clean_names()
glimpse(hiv)
Rows: 225
Columns: 4
$ iso3 <chr> "AFG", "ALB", "ALB", "ARG", "ARM", "ARM", "ARM", "…
$ year <dbl> 2010, 2005, 2008, 2012, 2000, 2005, 2010, 2006, 20…
$ age_15_17 <dbl> 33, 98, 98, 100, 93, 99, 100, 81, 12, 16, 19, 31, …
$ age_20_34 <chr> "29", "96", "98", "100", "87", "100", "100", "87",…
# How many countries reported data?
n_distinct(hiv$iso3)
[1] 100
# what is the difference between min and max year with valid data for each country?
# age 20-34 has some dashes to be filtered out
# Span in years between the first and last report of each country, after
# discarding rows whose age_20_34 entry is the "-" placeholder.
hiv %>%
  filter(age_20_34 != "-") %>%
  group_by(iso3) %>%
  # range() returns c(min, max); diff() of that pair is max - min.
  summarise(diff = diff(range(year)))
# A tibble: 100 x 2
iso3 diff
<chr> <dbl>
1 AFG 0
2 ALB 3
3 ARG 0
4 ARM 10
5 AZE 0
6 BDI 5
7 BEN 10
8 BFA 7
9 BGD 8
10 BIH 5
# … with 90 more rows
# how many countries reported data in 3 or more years?
# Countries that appear with at least three different years.
# Reduce to unique (country, year) pairs first, then tally the pairs per country.
hiv %>%
  distinct(iso3, year) %>%
  count(iso3, name = "years_count") %>%
  filter(years_count >= 3) # 34 countries
# A tibble: 34 x 2
iso3 years_count
<chr> <int>
1 ARM 3
2 BEN 3
3 BFA 3
4 BGD 6
5 CMR 3
6 COD 3
7 COL 3
8 DOM 3
9 EGY 5
10 ETH 3
# … with 24 more rows
# which countries reported 100% incidence rate for at least one year in either age group?
# Countries with a value of 100 in either age group for at least one year.
hiv %>%
  # age_15_17 is numeric but age_20_34 was read as character, so compare the
  # latter against the string "100" rather than relying on R silently
  # coercing the number 100 to text.
  filter(age_15_17 == 100 | age_20_34 == "100") %>%
  distinct(iso3) # 18 countries
# A tibble: 18 x 1
iso3
<chr>
1 ARG
2 ARM
3 BRB
4 BLR
5 BIH
6 CUB
7 DOM
8 JAM
9 KAZ
10 KGZ
11 MKD
12 MDA
13 MNE
14 LCA
15 SRB
16 THA
17 UKR
18 URY
A student has sent you the file rmd-country-profile.Rmd, which is an R Markdown document analyzing the data in at_health_facilities.csv for Bangladesh. They could not knit the file, and are providing you with the raw .Rmd file instead of a rendered file.
Go through the file, fixing things that are preventing it from knitting cleanly. Change the two lines of bold text to H2-level headers to organize the document, and add a table of contents. Convert this R Markdown report for Bangladesh into a parameterized report with the country’s iso3 code as its parameter. Knit a new country profile for Egypt (ISO3 code “EGY”).
infant_hiv <- read_csv("https://education.rstudio.com/blog/2020/02/instructor-certification-exams/infant_hiv.csv")
# tidy the data into 4 columns: ISO3, year, state, value
# Reshape infant_hiv to long format: every column except ISO3 is split at the
# space in its name into `year` and `state`, with the cell stored in `value`.
tidy <- infant_hiv %>%
  pivot_longer(
    cols = !ISO3,
    names_to = c("year", "state"),
    names_pattern = "(.*) (.*)"
  ) %>%
  mutate(
    # The placeholders "-" and ">95%" become missing values ...
    value = if_else(value %in% c("-", ">95%"), NA_character_, value),
    # ... and the first percent sign is removed from whatever remains.
    value = str_replace(value, pattern = "%", replacement = "")
  )
tidy
# A tibble: 5,184 x 4
ISO3 year state value
<chr> <chr> <chr> <chr>
1 AFG 2009 est <NA>
2 AFG 2009 hi <NA>
3 AFG 2009 lo <NA>
4 AFG 2010 est <NA>
5 AFG 2010 hi <NA>
6 AFG 2010 lo <NA>
7 AFG 2011 est <NA>
8 AFG 2011 hi <NA>
9 AFG 2011 lo <NA>
10 AFG 2012 est <NA>
# … with 5,174 more rows
# write the function
#' Read a wide CSV file keyed by ISO3 and return it in long format.
#'
#' Every column other than `ISO3` is split at the space in its name into
#' `year` and `state`; the cell contents go to `value`. The placeholders
#' "-" and ">95%" become NA, and the first "%" is removed from other values.
#'
#' @param file Path or URL passed on to `read_csv()`.
#' @return A tibble with columns ISO3, year, state and value (character).
tidy_data <- function(file) {
  read_csv(file) %>%
    pivot_longer(
      cols = !ISO3,
      names_to = c("year", "state"),
      names_pattern = "(.*) (.*)"
    ) %>%
    mutate(
      value = if_else(
        value %in% c("-", ">95%"),
        NA_character_,
        str_replace(value, pattern = "%", replacement = "")
      )
    )
}
tidy_data("https://education.rstudio.com/blog/2020/02/instructor-certification-exams/infant_hiv.csv")
# A tibble: 5,184 x 4
ISO3 year state value
<chr> <chr> <chr> <chr>
1 AFG 2009 est <NA>
2 AFG 2009 hi <NA>
3 AFG 2009 lo <NA>
4 AFG 2010 est <NA>
5 AFG 2010 hi <NA>
6 AFG 2010 lo <NA>
7 AFG 2011 est <NA>
8 AFG 2011 hi <NA>
9 AFG 2011 lo <NA>
10 AFG 2012 est <NA>
# … with 5,174 more rows
ranking <- read_csv("https://education.rstudio.com/blog/2020/02/instructor-certification-exams/ranking.csv")
# Share of negative vs. positive rankings per item, with point size showing
# how many rankings each item received.
ranking %>%
  group_by(item) %>%
  count(rank) %>%
  # One row per item, one count column per rank value.
  pivot_wider(
    names_from = rank,
    values_from = n
  ) %>%
  # Total rankings per item; TRUE (not the reassignable `T`) for na.rm.
  mutate(num = sum(positive, negative, indifferent, na.rm = TRUE)) %>%
  # Turn each rank count into a proportion of the item's total.
  mutate_at(vars(-item, -num), .funs = list(~ round(. / num, digits = 2))) %>%
  ggplot(aes(negative, positive)) +
  # alpha is a fixed setting, so it goes outside aes(); inside aes() it is
  # treated as a data mapping and produces a meaningless legend entry.
  # size describes the points only, so it is mapped on this layer rather than
  # globally where the smoother would also receive it.
  geom_point(aes(size = num), alpha = 0.25) +
  geom_smooth(method = "lm")
https://tidyverse-exam-v2-solutions.netlify.app/
For attribution, please cite this work as
lruolin (2021, May 27). pRactice corner: Practice questions. Retrieved from https://lruolin.github.io/myBlog/posts/20210526_Practicing/
BibTeX citation
@misc{lruolin2021practice, author = {lruolin, }, title = {pRactice corner: Practice questions}, url = {https://lruolin.github.io/myBlog/posts/20210526_Practicing/}, year = {2021} }