Home > front end >  Using pivot_longer() to gather up hundreds of like variables using REGEX tidyr
Using pivot_longer() to gather up hundreds of like variables using REGEX tidyr

Time:09-08

I have a question regarding pivot_longer() which does not appear to have been previously discussed. The dataset has over 10 thousand columns, but only 242 rows. Below only holds information for estimates, across all of these are 21 other variables (lower CI vs subtype etc) and they must be lengthened as well.

TEST_table <- structure(list(
Estimate.1.Country.... = c("45", "", "22", "39", "23", "2"),
Estimate.1.Year.Season = c("2017/18", "", "2017/18", "2012/13", "2013/14", ""),
Estimate.1.Objective.... = c("", "", "1A", "1A", "", ""),
Estimate.1.Risk.group.... = c("", "", "2", "2", "", ""),
Estimate.1.Risk.group.description = c("", "", "aged 65 years or older", "aged 65 years or older", "", ""),
Estimate.1.Age.range...lower = c("", "", "65", "65", "", ""),
Estimate.1.Age.range...upper = c("", "", "", "", "", ""),
Estimate.1.Sex = c("", "", "", "", "", ""),
Estimate.1.Vaccine.... = c("", "", "all vaccines", "all vaccines", "", ""),
Estimate.1.Match...y.n. = c("", "", "", "", "", ""),
Estimate.1.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.1.Circulating.strains = c("A(H3N2) [dominant]","","","A(H3N2) [dominant]","A(H1N1)pdm09 [dominant]","A(H1N1)pdm09 [dominant]"),
Estimate.1.Outcome.... = c("1", "", "1", "1", "", ""),
Estimate.1.vs.virus..sub.type = c("", "", "influenza A and B", "influenza A", "", ""),
Estimate.1.Measure = c("", "", "aVE (%)", "aVE (%)", "", ""),
Estimate.1.Value = c("", "", "88.5", "34", "", ""),
Estimate.1.CI...lower = c(NA, NA, 38.9,-1, NA, NA),
Estimate.1.CI...upper = c(NA, NA, 97.8, 60, NA, NA),
Estimate.1.Comment = c("","",
"adjusted for age, sex, hospitalisations and GP visits in last 12 months, vaccination status for previous years, underling conditions, deterioration, dependency",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.2.Country.... = c("", "", "22", "39", "", ""),
Estimate.2.Year.Season = c("", "", "2018/19", "2013/14", "", ""),
Estimate.2.Objective.... = c("", "", "1A", "1A", "", ""),
Estimate.2.Risk.group.... = c("", "", "2", "2", "", ""),
Estimate.2.Risk.group.description = c("", "", "aged 65 years or older", "aged 65 years or older", "", ""),
Estimate.2.n = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.2.Age.range...upper = c("", "", "", "", "", ""),
Estimate.2.Sex = c("", "", "", "", "", ""),
Estimate.2.Vaccine.... = c("", "", "influenza A and B", "all vaccines", "", ""),
Estimate.2.Match...y.n. = c("", "", "", "", "", ""),
Estimate.2.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.2.Circulating.strains = c("", "", "", "A(H3N2) [dominant]", "", ""),
Estimate.2.Outcome.... = c("", "", "1", "1", "", ""),
Estimate.2.vs.virus..sub.type = c("", "", "influenza A and B", "influenza A", "", ""),
Estimate.2.Measure = c("", "", "aVE (%)", "aVE (%)", "", ""),
Estimate.2.Value = c(NA,NA, 61.7,-25, NA, NA),
Estimate.2.CI...lower = c(NA, NA,-59.9,-96, NA, NA),
Estimate.2.CI...upper = c(NA, NA, 90.9, 20, NA, NA),
Estimate.2.Comment = c("","",
"adjusted for age, sex, hospitalisations and GP visits in last 12 months, vaccination status for previous years, underling conditions, deterioration, dependency",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.3.Country.... = c("45", "", "", "39", "", ""),
Estimate.3.Year.Season = c("2017/18", "", "", "2014/15", "", ""),
Estimate.3.Objective.... = c("1A", "", "", "1A", "", ""),
Estimate.3.Risk.group.... = c("2", "", "", "2", "", ""),
Estimate.3.Risk.group.description = c("aged 65 years or older", "", "", "aged 65 years or older", "", ""),
Estimate.3.n = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.3.N = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.3.Sex = c("", "", "", "", "", ""),
Estimate.3.Vaccine.... = c("1", "", "", "all vaccines", "", ""),
Estimate.3.Match...y.n. = c("", "", "", "", "", ""),
Estimate.3.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.3.Circulating.strains = c("A(H3N2) [dominant]", "", "", "A(H3N2) [dominant]", "", ""),
Estimate.3.Outcome.... = c("1", "", "", "1", "", ""),
Estimate.3.vs.virus..sub.type = c("influenza A and B","", "", "influenza A", "", ""),
Estimate.3.Measure = c("uVE (%)","", "", "aVE (%)", "", ""),
Estimate.3.Value = c(4, NA, NA,-22, NA, NA),
Estimate.3.CI...lower = c(-13, NA, NA,-71, NA, NA),
Estimate.3.CI...upper = c(19, NA, NA, 13, NA, NA),
Estimate.3.Comment = c("","","",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.4.Country.... = c("45", "", "", "39", "", "")),
row.names = c(NA, 6L),class = "data.frame")

As not fully seen in the data above I have 500 columns for individual values that I am looking to gather/pivot longer. Because they are not separated by values such as _ I need to make as many names_to = as names_patterns = are available.

This is my coding, but no matter how I enter REGEX codes it does not appear to work. I would need 22 patterns that recognise strings within column names for example "Country" or "Subtype" and count them as the same variable for lengthening purposes.

Data.L %>% pivot_longer(cols = 153:10652,
           names_to = c('Country', '.value', 'Season', '.value', 'Objective', '.value', 'RiskGroup', '.value','RiskGroupDescription', '.value',
                       'n', '.value', 'N', '.value', 'AgeRangeLower', '.value', 'AgeRangerUpper', '.value', 'Sex', '.value', 'Vaccine(#)', '.value',
                       'VaccineMatch', '.value', 'VaccineStrains',  '.value', 'CirculatingStrains', '.value', 'Outcome', '.value',
                       'VirusSubtype', '.value', 'EstimateMeasure', '.value', 'EstimateValue', '.value',
                       'EstimateLowerCI', '.value', 'EstimateUpperCI', '.value', 'Comment', '.value'),
           # names_prefix = 'Estimate.',
           names_patterns = c())

Name patterns that I attempted include (BUT certainly not limited to):

'(.*Country.*)', '(.*)Year(.*)', '(.*)Objective(.*)', '(.*)Risk.group..(.*)',
                                     '(.*)Risk.group.d(.*)', '(.*)n(.*)', '(.*)N(.*)', '(.*)Age.range...lower(.*)',
                                     '(.*)Age.range...upper(.*)', '(.*)Sex(.*)', '(.*)Vaccine....(.*)', '(.*)Match(.*)',
                                     '(.*)Vaccine.s(.*)', '(.*)Cir(.*)', '(.*)Outcome(.*)', '(.*)vs.(.*)', '(.*)Mea(.*)', '(.*)Value(.*)',
                                     '(.*)CI...lower(.*)', '(.*)CI...upper(.*)', '(.*)Com(.*)'

Any suggestions greatly appreciated. I am hoping to accomplish this using regex that way I am not manually compiling this via reshape etc.

Cheers

CodePudding user response:

You could set ".value" in names_to and supply one of names_sep or names_pattern to specify how the column names should be split.

library(tidyr)

TEST_table %>%
  pivot_longer(
    everything(),
    names_to = c("Est", ".value"),
    names_pattern = "([^.] \\.\\d )\\.(. )",
    values_transform = list(Value = as.numeric)
  )

# A tibble: 24 × 22
   Est        Country.... Year.Season Objective.... Risk.group.... Risk.group.description   Age.range...lower Age.range...upper Sex   Vaccine....
   <chr>      <chr>       <chr>       <chr>         <chr>          <chr>                    <chr>             <chr>             <chr> <chr>
 1 Estimate.1 "45"        "2017/18"   ""            ""             ""                       ""                ""                ""    ""
 2 Estimate.2 ""          ""          ""            ""             ""                        NA               ""                ""    ""
 3 Estimate.3 "45"        "2017/18"   "1A"          "2"            "aged 65 years or older"  NA                NA               ""    "1"
 4 Estimate.4 "45"         NA          NA            NA             NA                       NA                NA                NA    NA
 5 Estimate.1 ""          ""          ""            ""             ""                       ""                ""                ""    ""
 6 Estimate.2 ""          ""          ""            ""             ""                        NA               ""                ""    ""
 7 Estimate.3 ""          ""          ""            ""             ""                        NA                NA               ""    ""
 8 Estimate.4 ""           NA          NA            NA             NA                       NA                NA                NA    NA
 9 Estimate.1 "22"        "2017/18"   "1A"          "2"            "aged 65 years or older" "65"              ""                ""    "all vaccines"     
10 Estimate.2 "22"        "2018/19"   "1A"          "2"            "aged 65 years or older"  NA               ""                ""    "influenza A and B"
# … with 14 more rows

Note: values_transform = list(Value = as.numeric) gets rid of the following error:

Error in `pivot_longer_spec()`:
! Can't combine `Estimate.1.Value` <character> and `Estimate.2.Value` <double>.
  • Related