Home > OS >  Unnesting elements of large list into dataframe
Unnesting elements of large list into dataframe

Time:10-28

I am currently working with a large dataset I retrieved from the crossref API in which I retrieved information on scientific papers based on a DOI search.

Currently the large list consists of ~3500 elements. Each of these elements is itself a list consisting of the metadata 'meta', the actual relevant data 'data', and an irrelevant list 'facets'.

This is an example of two of the lists, based on two DOIs:

list(`10.1158/1055-9965.EPI-08-0303` = list(meta = NULL, data = structure(list(
    alternative.id = "10.1158/1055-9965.EPI-08-0303", container.title = "Cancer Epidemiology Biomarkers & Prevention", 
    created = "2008-11-06", deposited = "2020-12-24", published.print = "2008-11", 
    published.online = "2008-11-06", doi = "10.1158/1055-9965.epi-08-0303", 
    indexed = "2021-10-17", issn = "1055-9965,1538-7755", issue = "11", 
    issued = "2008-11", member = "1086", page = "3216-3223", 
    prefix = "10.1158", publisher = "American Association for Cancer Research (AACR)", 
    score = "1", source = "Crossref", reference.count = "31", 
    references.count = "31", is.referenced.by.count = "50", subject = "Oncology,Epidemiology", 
    title = "20 Years into the Gambia Hepatitis Intervention Study: Assessment of Initial Hypotheses and Prospects for Evaluation of Protective Effectiveness Against Liver Cancer", 
    type = "journal-article", url = "http://dx.doi.org/10.1158/1055-9965.epi-08-0303", 
    volume = "17", language = "en", short.container.title = "Cancer Epidemiol Biomarkers Prev", 
    author = list(structure(list(given = c("Simonetta", "Patrizia", 
    "Ebrima", "Andrew J.", "Gregory D.", "Maimuna", "Ruggero", 
    "Amelie", "Omar", "Marianne", "Hilton", "Pierre"), family = c("Viviani", 
    "Carrieri", "Bah", "Hall", "Kirk", "Mendy", "Montesano", 
    "Plymoth", "Sam", "Van der Sande", "Whittle", "Hainaut"), 
        sequence = c("first", "additional", "additional", "additional", 
        "additional", "additional", "additional", "additional", 
        "additional", "additional", "additional", "additional"
        )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
    -12L))), link = list(structure(list(URL = "https://syndication.highwire.org/content/doi/10.1158/1055-9965.EPI-08-0303", 
        content.type = "unspecified", content.version = "vor", 
        intended.application = "similarity-checking"), class = c("tbl_df", 
    "tbl", "data.frame"), row.names = c(NA, -1L)))), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -1L)), facets = NULL), 
    `10.1016/j.canlet.2007.10.044` = list(meta = NULL, data = structure(list(
        alternative.id = "S0304383507005253", container.title = "Cancer Letters", 
        created = "2008-01-14", deposited = "2019-01-01", published.print = "2008-03", 
        doi = "10.1016/j.canlet.2007.10.044", indexed = "2021-10-07", 
        issn = "0304-3835", issue = "1", issued = "2008-03", 
        member = "78", page = "21-25", prefix = "10.1016", publisher = "Elsevier BV", 
        score = "1", source = "Crossref", reference.count = "20", 
        references.count = "20", is.referenced.by.count = "71", 
        subject = "Cancer Research,Oncology", title = "Detection of R337H, a germline TP53 mutation predisposing to multiple cancers, in asymptomatic women participating in a breast cancer screening program in Southern Brazil", 
        type = "journal-article", url = "http://dx.doi.org/10.1016/j.canlet.2007.10.044", 
        volume = "261", language = "en", short.container.title = "Cancer Letters", 
        author = list(structure(list(given = c("Edenir Inêz", 
        "Lavínia", "Maira", "Maria Isabel Waddington", "Magali", 
        "Ghyslaine", "Virginie", "Ernestina", "Juliana", "Ingrid Petroni", 
        "Roberto", "Pierre", "Patricia"), family = c("Palmero", 
        "Schüler-Faccini", "Caleffi", "Achatz", "Olivier", "Martel-Planche", 
        "Marcel", "Aguiar", "Giacomazzi", "Ewald", "Giugliani", 
        "Hainaut", "Ashton-Prolla"), sequence = c("first", "additional", 
        "additional", "additional", "additional", "additional", 
        "additional", "additional", "additional", "additional", 
        "additional", "additional", "additional")), class = c("tbl_df", 
        "tbl", "data.frame"), row.names = c(NA, -13L))), link = list(
            structure(list(URL = c("https://api.elsevier.com/content/article/PII:S0304383507005253?httpAccept=text/xml", 
            "https://api.elsevier.com/content/article/PII:S0304383507005253?httpAccept=text/plain"
            ), content.type = c("text/xml", "text/plain"), content.version = c("vor", 
            "vor"), intended.application = c("text-mining", "text-mining"
            )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
            -2L))), license = list(structure(list(date = "2008-03-01", 
            content.version = "tdm", delay.in.days = 0L, URL = "https://www.elsevier.com/tdm/userlicense/1.0/"), class = c("tbl_df", 
        "tbl", "data.frame"), row.names = c(NA, -1L))), reference = list(
            structure(list(key = c("10.1016/j.canlet.2007.10.044_bib1", 
            "10.1016/j.canlet.2007.10.044_bib2", "10.1016/j.canlet.2007.10.044_bib3", 
            "10.1016/j.canlet.2007.10.044_bib4", "10.1016/j.canlet.2007.10.044_bib5", 
            "10.1016/j.canlet.2007.10.044_bib6", "10.1016/j.canlet.2007.10.044_bib7", 
            "10.1016/j.canlet.2007.10.044_bib8", "10.1016/j.canlet.2007.10.044_bib9", 
            "10.1016/j.canlet.2007.10.044_bib10", "10.1016/j.canlet.2007.10.044_bib11", 
            "10.1016/j.canlet.2007.10.044_bib12", "10.1016/j.canlet.2007.10.044_bib13", 
            "10.1016/j.canlet.2007.10.044_bib14", "10.1016/j.canlet.2007.10.044_bib15", 
            "10.1016/j.canlet.2007.10.044_bib16", "10.1016/j.canlet.2007.10.044_bib17", 
            "10.1016/j.canlet.2007.10.044_bib18", "10.1016/j.canlet.2007.10.044_bib19", 
            "10.1016/j.canlet.2007.10.044_bib20"), doi.asserted.by = c("crossref", 
            "crossref", NA, NA, NA, "crossref", NA, NA, "crossref", 
            "crossref", "crossref", NA, NA, NA, "crossref", "crossref", 
            "crossref", "crossref", "crossref", NA), first.page = c("96", 
            "266", "1298", "877s", "1932", "12", "10", "608", 
            "2658", "1954", "133", "1703", "1365", "5358", "1215", 
            "607", "647", "607", "9330", "1213"), DOI = c("10.1016/j.canlet.2005.12.039", 
            "10.1159/000154228", NA, NA, NA, "10.1038/nsb730", 
            NA, NA, "10.1002/1097-0142(19901215)66:12<2658::AID-CNCR2820661232>3.0.CO;2-C", 
            "10.1038/sj.onc.1207305", "10.1016/0165-4608(93)90166-J", 
            NA, NA, NA, "10.1093/nar/16.3.1215", "10.1002/humu.10081", 
            "10.1590/S0004-27302004000500009", "10.1007/BF00202835", 
            "10.1073/pnas.161479898", NA), article.title = c("The TP53 mutation, R337H, is associated with Li–Fraumeni and Li–Fraumeni-like syndromes in Brazilian families", 
            "Is p53 polymorphism maintained by natural selection?", 
            "Prevalence and diversity of constitutional mutations in the p53 gene among 21 Li–Fraumeni families", 
            "Breast cancer screening in 10,000 women of an underserved population in South Brazil: The NMAMAPOA cohort", 
            "P53 germline mutations in childhood cancers and cancer risk for carrier individuals", 
            "A novel mechanism of tumorigenesis involving pH-dependent destabilization of a mutant p53 tetramer", 
            "Germline mutations in the TP53 gene", "Germ-line p53 mutations in 15 families with Li–Fraumeni syndrome", 
            "Choroid plexus tumors in the breast cancer-sarcoma syndrome", 
            "A TP53 polymorphism is associated with increased risk of colorectal cancer and with reduced levels of TP53 mRNA", 
            "Wilms’ tumor in the Li–Fraumeni cancer family syndrome", 
            "Simple sequence repeat polymorphism within the p53 gene", 
            "Rhabdomyosarcoma in children: epidemiologic study and identification of a familial cancer syndrome", 
            "A cancer family syndrome in twenty-four kindreds", 
            "A simple salting out procedure for extracting DNA from human nucleated cells", 
            "The IARC TP53 database: new online mutation analysis and recommendations to users", 
            "Founder effect for the highly prevalent R337H mutation of tumor suppressor p53 in Brazilian patients with adrenocortical tumors", 
            "Identification of a polymorphism in intron 2 of the p53 gene", 
            "An inherited p53 mutation that contributes in a tissue-specific manner to pediatric adrenal cortical carcinoma", 
            "Cancer in survivors of childhood soft tissue sarcoma and their relatives"
            ), volume = c("245", "44", "54", "23", "82", "9", 
            "25", "56", "66", "23", "67", "8", "43", "48", "16", 
            "19", "48", "93", "98", "79"), author = c("Achatz", 
            "Beckman", "Birch", "Caleffi", "Chompret", "DiGiammarino", 
            "Eeles", "Frebourg", "Garber", "Gemignani", "Hartley", 
            "Lazar", "Li", "Li", "Miller", "Olivier", "Pinto", 
            "Pleasants", "Ribeiro", "Strong"), year = c("2007", 
            "1994", "1994", "2005", "2000", "2002", "1995", "1995", 
            "1990", "2004", "1993", "1993", "1969", "1988", "1988", 
            "2002", "2004", "1994", "2001", "1987"), journal.title = c("Cancer Lett.", 
            "Hum. Hered.", "Cancer Res.", "J. Clin. Oncol.", 
            "Br. J. Cancer", "Nat. Struct. Biol.", "Cancer Surv.", 
            "Am. J. Hum. Genet.", "Cancer", "Oncogene", "Cancer Genet. Cytogenet.", 
            "Oncogene", "J. Natl. Cancer Inst.", "Cancer Res.", 
            "Nucleic Acids Res.", "Hum. Mutat.", "Arq. Bras. Endocrinol. Metabol.", 
            "Hum. Genet.", "Proc. Natl. Acad. Sci. USA", "J. Natl. Cancer Inst."
            ), issue = c(NA, "5", "5", "16 S", "12", "1", NA, 
            "3", "12", "10", "2", "6", "6", "18", "3", "6", "5", 
            "5", "16", "6")), class = c("tbl_df", "tbl", "data.frame"
            ), row.names = c(NA, -20L)))), class = c("tbl_df", 
    "tbl", "data.frame"), row.names = c(NA, -1L)), facets = NULL))

All the data under 'data' is relevant to me. I would like to unnest it and construct one large dataframe with the DOIs in one column and the fields under 'data' in the other columns.

I've tried using the unnest code unnest(data) but that results in Error in UseMethod("unnest") : no applicable method for 'unnest' applied to an object of class "list"

Is there an easy way to do this?

CodePudding user response:

Like this? Note - it is better to include a Minimal reprex that includes a toy data set, rather than a snapshot of what you have. This way the question will likely get answers faster.

# Toy stand-in for the real result: a list named by DOI in which every
# element carries a NULL `metadata` entry and a one-row `data` tibble.
# `tibble::` is spelled out so the snippet runs without library(tibble).
ll <- list(
  `10.1016/j.ejca.2017.11.029` = list(
    metadata = NULL,
    data = tibble::tibble(one = 1, two = 2)
  ),
  `10.1016/j.ejca.2017.12.500` = list(
    metadata = NULL,
    data = tibble::tibble(one = 3, two = 4)
  )
)

# The DOIs are the names of the outer list.
nms <- names(ll)
# Extract the `data` element of every entry. Selecting by name gives the same
# result as selecting position 2 here, but keeps working if the order of the
# elements inside each entry ever changes.
vals <- lapply(ll, `[[`, "data")

# One row per DOI; `data` is a (named) list-column holding one tibble each.
tibble::tibble(
  DOI = nms,
  data = vals
)

# or shorten to
tibble::tibble(
  DOI = names(ll),
  data = lapply(ll, `[[`, "data")
)

# To spread the list-column into ordinary columns (one wide data frame),
# pass either result on to tidyr::unnest(data).

# A tibble: 2 x 2
  DOI                        data            
  <chr>                      <named list>    
1 10.1016/j.ejca.2017.11.029 <tibble [1 x 2]>
2 10.1016/j.ejca.2017.12.500 <tibble [1 x 2]>

Explanation: here the names of the list carry information (the DOIs), and the relevant information is the 2nd element of each entry. lapply(ll, `[[`, 2) is equivalent to list(ll[[1]][[2]], ll[[2]][[2]], ...), with the names of ll kept on the result.

  • Related