Webchem: a very useful package!
Before I knew how to export the database out from FlavorBase (a commercial flavor database), I had to type CAS numbers and FEMA numbers manually. I wished there was a way for me to tap on the existing open-source web resources, and at that time, I came across the webchem package, but I had little coding skills and did not know how to iteratively query for multiple compounds.
I have now revisited the package and, as a continuation of my first attempt at PDF scraping, I will try to append chemical information to the compounds!
It loads data from 14 web databases, although you may need to have the API accounts for some of them.
The code below searches open-source databases.
It was taken from the package documentation and covers the functions I use most often for my case.
CIR (the Chemical Identifier Resolver) can resolve several kinds of identifier, such as a chemical name or a CAS number, as the examples below show:
cir_query("allyl propyl disulfide", representation = "cas",
match = "first") # search by name
$`allyl propyl disulfide`
[1] "2179-59-1"
cir_query("Vanillin", representation = "names") # search by name
$Vanillin
[1] "4-Hydroxy-3-methoxybenzaldehyde"
[2] "4-Hydroxy-3-methoxy-benzaldehyde"
[3] "121-33-5"
[4] "52447-63-9"
[5] "8014-42-4"
[6] "c0193"
[7] "trans-2-Ethoxy-5-(1-propenyl)phenol"
[8] "4-08-00-01763 (Beilstein Handbook Reference)"
[9] "AI3-00093"
[10] "4-Hydroxy-3-methoxy-benzaldehyde"
[11] "4-Hydroxy-3-methoxybenzaldehyde"
[12] "C00755"
[13] "Vanillaldehyde"
[14] "HSDB 1027"
[15] "Methylprotocatechuic aldehyde"
[16] "NSC 15351"
[17] "Protocatechualdehyde 3-methyl ether"
[18] "Vanillin (natural)"
[19] "Vanillin [USAN]"
[20] "Vanilline"
[21] "p-Vanillin"
[22] "SGCUT00016"
[23] "ZINC02567933"
[24] "NSC403658"
[25] "D00091"
[26] "Vanillin (NF)"
[27] "W310700_ALDRICH"
[28] "W310727_ALDRICH"
[29] "AIDS-017927"
[30] "AIDS017927"
[31] "AC-907/21098004"
[32] "94750_FLUKA"
[33] "SBB000108"
[34] "BRN 0472792"
[35] "LS-2459"
[36] "CCRIS 2687"
[37] "CHEBI:18346"
[38] "vaniline"
[39] "EINECS 204-465-2"
[40] "V1104_ALDRICH"
[41] "FEMA No. 3107"
[42] "NCI60_001085"
[43] "94752_FLUKA"
[44] "NCGC00091645-03"
[45] "VANILLIN"
[46] "nchembio882-comp7"
[47] "NSC48383"
[48] "4-HYDROXY,3-METHOXY-BENZALDEHYDE"
[49] "NCGC00091645-01"
[50] "to_000089"
[51] "V2375_SIGMA"
[52] "InChI=1/C8H8O3/c1-11-8-4-6(5-9)2-3-7(8)10/h2-5,10H,1H"
[53] "LS-645"
[54] "2-Methoxy-4-formylphenol"
[55] "3-Methoxy-4-hydroxybenzaldehyde"
[56] "4-Formyl-2-methoxyphenol"
[57] "4-Hydroxy-5-methoxybenzaldehyde"
[58] "4-Hydroxy-m-anisaldehyde"
[59] "Benzaldehyde, 4-hydroxy-3-methoxy-"
[60] "Lioxin"
[61] "NSC15351"
[62] "Protocatechualdehyde, methyl-"
[63] "Vanilla"
[64] "Vanillic aldehyde"
[65] "WLN: VHR DQ CO1"
[66] "Zimco"
[67] "m-Anisaldehyde, 4-hydroxy-"
[68] "p-Hydroxy-m-methoxybenzaldehyde"
[69] "4-Hydroxy-3-methoxybenzaldehyde"
[70] "4-Hydroxy-3-methoxy-benzaldehyde"
[71] "121-33-5"
[72] "52447-63-9"
[73] "8014-42-4"
[74] "c0193"
[75] "trans-2-Ethoxy-5-(1-propenyl)phenol"
[76] "4-08-00-01763 (Beilstein Handbook Reference)"
[77] "AI3-00093"
[78] "4-Hydroxy-3-methoxy-benzaldehyde"
[79] "4-Hydroxy-3-methoxybenzaldehyde"
[80] "C00755"
[81] "Vanillaldehyde"
[82] "HSDB 1027"
[83] "Methylprotocatechuic aldehyde"
[84] "NSC 15351"
[85] "Protocatechualdehyde 3-methyl ether"
[86] "Vanillin (natural)"
[87] "Vanillin [USAN]"
[88] "Vanilline"
[89] "p-Vanillin"
[90] "SGCUT00016"
[91] "ZINC02567933"
[92] "NSC403658"
[93] "D00091"
[94] "Vanillin (NF)"
[95] "W310700_ALDRICH"
[96] "W310727_ALDRICH"
[97] "AIDS-017927"
[98] "AIDS017927"
[99] "AC-907/21098004"
[100] "94750_FLUKA"
[101] "SBB000108"
[102] "BRN 0472792"
[103] "LS-2459"
[104] "CCRIS 2687"
[105] "CHEBI:18346"
[106] "vaniline"
[107] "EINECS 204-465-2"
[108] "V1104_ALDRICH"
[109] "FEMA No. 3107"
[110] "NCI60_001085"
[111] "94752_FLUKA"
[112] "NCGC00091645-03"
[113] "VANILLIN"
[114] "nchembio882-comp7"
[115] "NSC48383"
[116] "4-HYDROXY,3-METHOXY-BENZALDEHYDE"
[117] "NCGC00091645-01"
[118] "to_000089"
[119] "V2375_SIGMA"
[120] "InChI=1/C8H8O3/c1-11-8-4-6(5-9)2-3-7(8)10/h2-5,10H,1H"
[121] "LS-645"
[122] "2-Methoxy-4-formylphenol"
[123] "3-Methoxy-4-hydroxybenzaldehyde"
[124] "4-Formyl-2-methoxyphenol"
[125] "4-Hydroxy-5-methoxybenzaldehyde"
[126] "4-Hydroxy-m-anisaldehyde"
[127] "Benzaldehyde, 4-hydroxy-3-methoxy-"
[128] "Lioxin"
[129] "NSC15351"
[130] "Protocatechualdehyde, methyl-"
[131] "Vanilla"
[132] "Vanillic aldehyde"
[133] "WLN: VHR DQ CO1"
[134] "Zimco"
[135] "m-Anisaldehyde, 4-hydroxy-"
[136] "p-Hydroxy-m-methoxybenzaldehyde"
cir_query("Vanillin", representation = "names", match = "first") # first match only
$Vanillin
[1] "4-Hydroxy-3-methoxybenzaldehyde"
cir_query("121-33-5", representation = "names") # search by CAS
$`121-33-5`
[1] "4-Hydroxy-3-methoxybenzaldehyde"
[2] "4-Hydroxy-3-methoxy-benzaldehyde"
[3] "121-33-5"
[4] "52447-63-9"
[5] "8014-42-4"
[6] "c0193"
[7] "trans-2-Ethoxy-5-(1-propenyl)phenol"
[8] "4-08-00-01763 (Beilstein Handbook Reference)"
[9] "AI3-00093"
[10] "4-Hydroxy-3-methoxy-benzaldehyde"
[11] "4-Hydroxy-3-methoxybenzaldehyde"
[12] "C00755"
[13] "Vanillaldehyde"
[14] "HSDB 1027"
[15] "Methylprotocatechuic aldehyde"
[16] "NSC 15351"
[17] "Protocatechualdehyde 3-methyl ether"
[18] "Vanillin (natural)"
[19] "Vanillin [USAN]"
[20] "Vanilline"
[21] "p-Vanillin"
[22] "SGCUT00016"
[23] "ZINC02567933"
[24] "NSC403658"
[25] "D00091"
[26] "Vanillin (NF)"
[27] "W310700_ALDRICH"
[28] "W310727_ALDRICH"
[29] "AIDS-017927"
[30] "AIDS017927"
[31] "AC-907/21098004"
[32] "94750_FLUKA"
[33] "SBB000108"
[34] "BRN 0472792"
[35] "LS-2459"
[36] "CCRIS 2687"
[37] "CHEBI:18346"
[38] "vaniline"
[39] "EINECS 204-465-2"
[40] "V1104_ALDRICH"
[41] "FEMA No. 3107"
[42] "NCI60_001085"
[43] "94752_FLUKA"
[44] "NCGC00091645-03"
[45] "VANILLIN"
[46] "nchembio882-comp7"
[47] "NSC48383"
[48] "4-HYDROXY,3-METHOXY-BENZALDEHYDE"
[49] "NCGC00091645-01"
[50] "to_000089"
[51] "V2375_SIGMA"
[52] "InChI=1/C8H8O3/c1-11-8-4-6(5-9)2-3-7(8)10/h2-5,10H,1H"
[53] "LS-645"
[54] "2-Methoxy-4-formylphenol"
[55] "3-Methoxy-4-hydroxybenzaldehyde"
[56] "4-Formyl-2-methoxyphenol"
[57] "4-Hydroxy-5-methoxybenzaldehyde"
[58] "4-Hydroxy-m-anisaldehyde"
[59] "Benzaldehyde, 4-hydroxy-3-methoxy-"
[60] "Lioxin"
[61] "NSC15351"
[62] "Protocatechualdehyde, methyl-"
[63] "Vanilla"
[64] "Vanillic aldehyde"
[65] "WLN: VHR DQ CO1"
[66] "Zimco"
[67] "m-Anisaldehyde, 4-hydroxy-"
[68] "p-Hydroxy-m-methoxybenzaldehyde"
# multiple inputs ------
# a vector of names can be queried in one call; the result has one entry per name
comp <- c("Vanillin", "Ethyl vanillin")
cir_query(comp, "cas", match = "first")
$Vanillin
[1] "121-33-5"
$`Ethyl vanillin`
[1] "121-32-4"
cir_query("Vanillin", representation = "mw")
$Vanillin
[1] 152.1494 152.1494
cir_query("Vanillin", representation = "formula")
$Vanillin
[1] "C8H8O3" "C8H8O3"
cir_query("Vanillin", representation = "ring_count")
$Vanillin
[1] 1 1
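The octanol–water partition coefficient (Kow) is greater than 1 if the compound is more hydrophobic/non-polar, and is less than 1 if the compound is more hydrophilic/polar. Note that `xlogp2` returns a predicted logarithm of this coefficient, so on the log scale the corresponding threshold is 0 rather than 1.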
cir_query("Vanillin", representation = "xlogp2")
$Vanillin
[1] 1.227 1.227
cir_query("Vanillin", representation = "aromatic")
$Vanillin
[1] "1" "1"
cir_query("Vanillin", representation = "smiles")
$Vanillin
[1] "COc1cc(C=O)ccc1O" "COc1cc(C=O)ccc1O"
cir_query("Vanillin", representation = "stdinchi")
$Vanillin
[1] "InChI=1S/C8H8O3/c1-11-8-4-6(5-9)2-3-7(8)10/h2-5,10H,1H3"
[2] "InChI=1S/C8H8O3/c1-11-8-4-6(5-9)2-3-7(8)10/h2-5,10H,1H3"
cir_query("Vanillin", representation = "stdinchikey")
$Vanillin
[1] "InChIKey=MWOOGOJBHIARFG-UHFFFAOYSA-N"
[2] "InChIKey=MWOOGOJBHIARFG-UHFFFAOYSA-N"
as.cas("121335")
121-33-5
"121-33-5"
Flavornet (queried with `fn_percept()` below) is not the most comprehensive database, and I prefer the output from FlavorBase. Perhaps, if I can successfully use the data from Fenaroli’s book, that would be a good addition for aroma and taste descriptors.
cir_query("123-32-0", "names")
$`123-32-0`
[1] "2,5-Dimethylpyrazine"
[2] "123-32-0"
[3] "2,5-Dimethyl-1,4-diazine"
[4] "2,5-Dimethylparadiazine"
[5] "2,5-Dimethylpiazine"
[6] "2,5-Dimethylpyrazine (natural)"
[7] "AI3-60303"
[8] "CCRIS 2929"
[9] "EINECS 204-618-3"
[10] "FEMA No. 3272"
[11] "NSC 49139"
[12] "W327204_ALDRICH"
[13] "ZINC00003182"
[14] "InChI=1/C6H8N2/c1-5-3-8-6(2)4-7-5/h3-4H,1-2H"
[15] "2,5-Dimethylpyrazine"
[16] "NSC49139"
[17] "Pyrazine, 2,5-dimethyl-"
[18] "WLN: T6N DNJ B1 E1"
[19] "PYRAZINE,2,5-DIMETHYL"
[20] "ST5437415"
[21] "175420_ALDRICH"
[22] "41535_FLUKA"
fn_percept("123-32-0")
123-32-0
"cocoa, roasted nut, roast beef, medicine"
# multiple input
# a vector of CAS numbers can be queried in one call; an individual result can
# come back as NA (78-94-4 does in the output below)
CASs <- c("75-07-0", "64-17-5", "109-66-0", "78-94-4", "78-93-3")
fn_percept(CASs)
75-07-0 64-17-5 109-66-0 78-94-4
"pungent, ether" "sweet" "alkane" NA
78-93-3
"ether"
The types of retention indices included in NIST include Kovats (“kovats”), Van den Dool and Kratz (“linear”), normal alkane (“alkane”), and Lee (“lee”).
You can choose to specify your search limits.
# Search NIST RI ------
nist_ri("123-32-0",
from = "cas",
type = "kovats",
polarity = "non-polar")
# A tibble: 5 × 12
query RI type phase length gas substrate diameter thickness
<chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl>
1 123-32-0 930 Capillary SE-30 25 Heli… <NA> 0.32 1
2 123-32-0 926 Capillary SE-30 25 Heli… <NA> 0.32 1
3 123-32-0 930 Capillary OV-1… 50 Heli… <NA> 0.25 NA
4 123-32-0 926 Capillary OV-1… 50 Heli… <NA> 0.25 NA
5 123-32-0 890 Packed DC-2… 4 <NA> Celite NA NA
# … with 3 more variables: temp <dbl>, reference <chr>, comment <chr>
The map function from purrr is extremely handy here!
# Look up the CAS registry number for a chemical name.
#
# chem_name: identifier handed to webchem::cir_query(), e.g. a compound name.
# Returns the "cas" representation of the first match, flattened from the
# list that cir_query() returns into a plain vector.
search_cas <- function(chem_name) {
  hit <- webchem::cir_query(
    chem_name,
    representation = "cas",
    match = "first"
  )
  unlist(hit)
}
# Look up the molecular weight of a compound.
#
# cas: identifier handed to webchem::cir_query() (a CAS number in this post).
# Returns the "mw" representation as a numeric value.
#
# match = "first" keeps a single value per query. Without it CIR can return
# the same weight more than once (e.g. `152.1494 152.1494` for vanillin),
# and purrr::map_dbl() fails unless each call yields exactly one value.
search_mw <- function(cas) {
  mw <- webchem::cir_query(cas, representation = "mw", match = "first")
  as.numeric(unlist(mw))
}
# Look up the molecular formula of a compound.
#
# cas: identifier handed to webchem::cir_query() (a CAS number in this post).
# Returns the "formula" representation, flattened to a plain vector.
#
# match = "first" keeps a single value per query. Without it CIR can return
# the same formula more than once (e.g. `"C8H8O3" "C8H8O3"` for vanillin),
# and purrr::map_chr() fails unless each call yields exactly one value.
search_formula <- function(cas) {
  formula <- webchem::cir_query(
    cas,
    representation = "formula",
    match = "first"
  )
  unlist(formula)
}
# Look up the octanol-water partition estimate of a compound.
#
# cas: identifier handed to webchem::cir_query() (a CAS number in this post).
# Returns the "xlogp2" representation, flattened to a plain vector.
#
# match = "first" keeps a single value per query. Without it CIR can return
# the same value more than once (e.g. `1.227 1.227` for vanillin), and
# purrr::map_dbl() fails unless each call yields exactly one value.
search_log_o_w <- function(cas) {
  log_o_w <- webchem::cir_query(
    cas,
    representation = "xlogp2",
    match = "first"
  )
  unlist(log_o_w)
}
# Look up the flavor percept (descriptor text) for a CAS number.
#
# cas: CAS number handed to webchem::fn_percept().
# Returns whatever fn_percept() yields, passed through unlist().
search_flv_percept <- function(cas) {
  percept <- webchem::fn_percept(cas)
  unlist(percept)
}
# Look up the NIST Kovats retention index on non-polar columns for a compound.
#
# cas: CAS number of the compound to look up.
# Returns the mean of all RI values NIST reports for that compound, rounded
# to one decimal place.
search_nist_ki <- function(cas) {
  # Query the CAS number supplied by the caller rather than a fixed compound,
  # so every row mapped over this function gets its own retention index.
  ri <- webchem::nist_ri(
    cas,
    from = "cas",
    type = "kovats",
    polarity = "non-polar"
  )
  round(mean(ri$RI), 1)
}
Let’s just randomly pick three compounds
# three example compounds to look up, one per row
chem_names <- tibble::tribble(
  ~chem_names,
  "ethyl vanillin",
  "ethyl butyrate",
  "2,6-dimethylpyrazine"
)

# merge in: append each web look-up as a new column. Within one mutate() the
# columns are created in order, so `cas` exists for the look-ups after it.
report_expanded <- mutate(
  chem_names,
  cas = map_chr(chem_names, search_cas),
  mw = map_dbl(cas, search_mw),
  chem_form = map_chr(cas, search_formula),
  log_o_w = map_dbl(cas, search_log_o_w),
  flv_percept = map_chr(cas, search_flv_percept)
)
report_expanded
# A tibble: 3 × 6
chem_names cas mw chem_form log_o_w flv_percept
<chr> <chr> <dbl> <chr> <dbl> <chr>
1 ethyl vanillin 121-32-4 166. C9H10O3 1.65 <NA>
2 ethyl butyrate 105-54-4 116. C6H12O2 1.49 apple
3 2,6-dimethylpyrazine 108-50-9 108. C6H8N2 0.633 roasted nut, …
I am not uploading my excel file (with compounds to query for) in this case, but here is an example code. I would need to import two files:
I would query for each of the compounds iteratively.
# load packages ------
library(pacman)
p_load(datapasta, tidyverse, webchem, readxl, writexl, janitor)
# import report ------
# read the report, clean up the column names, then drop two unused columns
data <- read_xlsx("import_data.xlsx") %>%
  clean_names() %>%
  select(-c(tent_t, fema_raw))
glimpse(data)
# import in-house database, use latest version ------
in_house_db <- read_excel("in_house.xlsx") %>%
  clean_names() %>%
  # keep one row per `cas_combined` value, retaining all other columns;
  # TRUE is spelled out because `T` is an ordinary, reassignable variable
  distinct(cas_combined, .keep_all = TRUE)
glimpse(in_house_db)
# merge in sensory descriptor ------
# keep every row of `data`, matching its `cas` to `cas_combined` in the
# in-house database
data_sensory_desc <- left_join(
  data,
  in_house_db,
  by = c("cas" = "cas_combined")
)
glimpse(data_sensory_desc)
# merge in webchem queries: one look-up per compound, keyed on the `cas` column
data_pubchem <- data_sensory_desc %>%
  mutate(
    # search_mw() returns a number, so it is collected with map_dbl();
    # map_chr() would store the weight as text
    mw = map_dbl(cas, search_mw),
    chem_form = map_chr(cas, search_formula),
    log_o_w = map_dbl(cas, search_log_o_w),
    flv_percept = map_chr(cas, search_flv_percept),
    nist_ki = map_dbl(cas, search_nist_ki)
  )
glimpse(data_pubchem)
For attribution, please cite this work as
lruolin (2021, Oct. 15). pRactice corner: Retrieving chemical information from the web. Retrieved from https://lruolin.github.io/myBlog/posts/20211022 Retrieving chemical information/
BibTeX citation
@misc{lruolin2021retrieving, author = {lruolin, }, title = {pRactice corner: Retrieving chemical information from the web}, url = {https://lruolin.github.io/myBlog/posts/20211022 Retrieving chemical information/}, year = {2021} }