I have a question regarding pivot_longer() which does not appear to have been previously discussed. The dataset has over 10 thousand columns, but only 242 rows. Below only holds information for estimates, across all of these are 21 other variables (lower CI vs subtype etc) and they must be lengthened as well.
TEST_table <- structure(list(
Estimate.1.Country.... = c("45", "", "22", "39", "23", "2"),
Estimate.1.Year.Season = c("2017/18", "", "2017/18", "2012/13", "2013/14", ""),
Estimate.1.Objective.... = c("", "", "1A", "1A", "", ""),
Estimate.1.Risk.group.... = c("", "", "2", "2", "", ""),
Estimate.1.Risk.group.description = c("", "", "aged 65 years or older", "aged 65 years or older", "", ""),
Estimate.1.Age.range...lower = c("", "", "65", "65", "", ""),
Estimate.1.Age.range...upper = c("", "", "", "", "", ""),
Estimate.1.Sex = c("", "", "", "", "", ""),
Estimate.1.Vaccine.... = c("", "", "all vaccines", "all vaccines", "", ""),
Estimate.1.Match...y.n. = c("", "", "", "", "", ""),
Estimate.1.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.1.Circulating.strains = c("A(H3N2) [dominant]","","","A(H3N2) [dominant]","A(H1N1)pdm09 [dominant]","A(H1N1)pdm09 [dominant]"),
Estimate.1.Outcome.... = c("1", "", "1", "1", "", ""),
Estimate.1.vs.virus..sub.type = c("", "", "influenza A and B", "influenza A", "", ""),
Estimate.1.Measure = c("", "", "aVE (%)", "aVE (%)", "", ""),
Estimate.1.Value = c("", "", "88.5", "34", "", ""),
Estimate.1.CI...lower = c(NA, NA, 38.9,-1, NA, NA),
Estimate.1.CI...upper = c(NA, NA, 97.8, 60, NA, NA),
Estimate.1.Comment = c("","",
"adjusted for age, sex, hospitalisations and GP visits in last 12 months, vaccination status for previous years, underling conditions, deterioration, dependency",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.2.Country.... = c("", "", "22", "39", "", ""),
Estimate.2.Year.Season = c("", "", "2018/19", "2013/14", "", ""),
Estimate.2.Objective.... = c("", "", "1A", "1A", "", ""),
Estimate.2.Risk.group.... = c("", "", "2", "2", "", ""),
Estimate.2.Risk.group.description = c("", "", "aged 65 years or older", "aged 65 years or older", "", ""),
Estimate.2.n = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.2.Age.range...upper = c("", "", "", "", "", ""),
Estimate.2.Sex = c("", "", "", "", "", ""),
Estimate.2.Vaccine.... = c("", "", "influenza A and B", "all vaccines", "", ""),
Estimate.2.Match...y.n. = c("", "", "", "", "", ""),
Estimate.2.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.2.Circulating.strains = c("", "", "", "A(H3N2) [dominant]", "", ""),
Estimate.2.Outcome.... = c("", "", "1", "1", "", ""),
Estimate.2.vs.virus..sub.type = c("", "", "influenza A and B", "influenza A", "", ""),
Estimate.2.Measure = c("", "", "aVE (%)", "aVE (%)", "", ""),
Estimate.2.Value = c(NA,NA, 61.7,-25, NA, NA),
Estimate.2.CI...lower = c(NA, NA,-59.9,-96, NA, NA),
Estimate.2.CI...upper = c(NA, NA, 90.9, 20, NA, NA),
Estimate.2.Comment = c("","",
"adjusted for age, sex, hospitalisations and GP visits in last 12 months, vaccination status for previous years, underling conditions, deterioration, dependency",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.3.Country.... = c("45", "", "", "39", "", ""),
Estimate.3.Year.Season = c("2017/18", "", "", "2014/15", "", ""),
Estimate.3.Objective.... = c("1A", "", "", "1A", "", ""),
Estimate.3.Risk.group.... = c("2", "", "", "2", "", ""),
Estimate.3.Risk.group.description = c("aged 65 years or older", "", "", "aged 65 years or older", "", ""),
Estimate.3.n = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.3.N = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.3.Sex = c("", "", "", "", "", ""),
Estimate.3.Vaccine.... = c("1", "", "", "all vaccines", "", ""),
Estimate.3.Match...y.n. = c("", "", "", "", "", ""),
Estimate.3.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.3.Circulating.strains = c("A(H3N2) [dominant]", "", "", "A(H3N2) [dominant]", "", ""),
Estimate.3.Outcome.... = c("1", "", "", "1", "", ""),
Estimate.3.vs.virus..sub.type = c("influenza A and B","", "", "influenza A", "", ""),
Estimate.3.Measure = c("uVE (%)","", "", "aVE (%)", "", ""),
Estimate.3.Value = c(4, NA, NA,-22, NA, NA),
Estimate.3.CI...lower = c(-13, NA, NA,-71, NA, NA),
Estimate.3.CI...upper = c(19, NA, NA, 13, NA, NA),
Estimate.3.Comment = c("","","",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.4.Country.... = c("45", "", "", "39", "", "")),
row.names = c(NA, 6L),class = "data.frame")
As not fully seen in the data above I have 500 columns for individual values that I am looking to gather/pivot longer. Because they are not separated by values such as _ I need to make as many names_to = as names_patterns = are available.
This is my coding, but no matter how I enter REGEX codes it does not appear to work. I would need 22 patterns that recognise strings within column names for example "Country" or "Subtype" and count them as the same variable for lengthening purposes.
Data.L %>% pivot_longer(cols = 153:10652,
names_to = c('Country', '.value', 'Season', '.value', 'Objective', '.value', 'RiskGroup', '.value','RiskGroupDescription', '.value',
'n', '.value', 'N', '.value', 'AgeRangeLower', '.value', 'AgeRangerUpper', '.value', 'Sex', '.value', 'Vaccine(#)', '.value',
'VaccineMatch', '.value', 'VaccineStrains', '.value', 'CirculatingStrains', '.value', 'Outcome', '.value',
'VirusSubtype', '.value', 'EstimateMeasure', '.value', 'EstimateValue', '.value',
'EstimateLowerCI', '.value', 'EstimateUpperCI', '.value', 'Comment', '.value'),
# names_prefix = 'Estimate.',
names_patterns = c())
Name patterns that I attempted include (BUT certainly not limited to):
'(.*Country.*)', '(.*)Year(.*)', '(.*)Objective(.*)', '(.*)Risk.group..(.*)',
'(.*)Risk.group.d(.*)', '(.*)n(.*)', '(.*)N(.*)', '(.*)Age.range...lower(.*)',
'(.*)Age.range...upper(.*)', '(.*)Sex(.*)', '(.*)Vaccine....(.*)', '(.*)Match(.*)',
'(.*)Vaccine.s(.*)', '(.*)Cir(.*)', '(.*)Outcome(.*)', '(.*)vs.(.*)', '(.*)Mea(.*)', '(.*)Value(.*)',
'(.*)CI...lower(.*)', '(.*)CI...upper(.*)', '(.*)Com(.*)'
Any suggestions greatly appreciated. I am hoping to accomplish this using regex that way I am not manually compiling this via reshape etc.
Cheers
CodePudding user response:
You could set ".value"
in names_to
and supply one of names_sep
or names_pattern
to specify how the column names should be split.
library(tidyr)
TEST_table %>%
pivot_longer(
everything(),
names_to = c("Est", ".value"),
names_pattern = "([^.] \\.\\d )\\.(. )",
values_transform = list(Value = as.numeric)
)
# A tibble: 24 × 22
Est Country.... Year.Season Objective.... Risk.group.... Risk.group.description Age.range...lower Age.range...upper Sex Vaccine....
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Estimate.1 "45" "2017/18" "" "" "" "" "" "" ""
2 Estimate.2 "" "" "" "" "" NA "" "" ""
3 Estimate.3 "45" "2017/18" "1A" "2" "aged 65 years or older" NA NA "" "1"
4 Estimate.4 "45" NA NA NA NA NA NA NA NA
5 Estimate.1 "" "" "" "" "" "" "" "" ""
6 Estimate.2 "" "" "" "" "" NA "" "" ""
7 Estimate.3 "" "" "" "" "" NA NA "" ""
8 Estimate.4 "" NA NA NA NA NA NA NA NA
9 Estimate.1 "22" "2017/18" "1A" "2" "aged 65 years or older" "65" "" "" "all vaccines"
10 Estimate.2 "22" "2018/19" "1A" "2" "aged 65 years or older" NA "" "" "influenza A and B"
# … with 14 more rows
Note: values_transform = list(Value = as.numeric)
gets rid of the following error:
Error in `pivot_longer_spec()`:
! Can't combine `Estimate.1.Value` <character> and `Estimate.2.Value` <double>.