I am importing multiple excel files. The files have a non-standard structure, but all have the required data after a row of headers, midway down the rows of the data frame.
Here is a MWE:
df= structure(list(...1 = c("CPET Results", NA, "Operator", NA, NA,
"Patient data", NA, "Administrative Data", "ID", "Title", "Last Name",
"First Name", "Name Addition", "Sex", "Date of Birth", NA, NA,
"Biological and Medical Baseline Data", "Height", "Weight", "Mask",
"Race", "Body Fat", "Hip/Waist Ratio", "BMI", "Estimated Fitness Level",
"BSA", "Hct", "Hb", "Medication that changes the Heart Rate",
"Medication", "Existing Medical Conditions", NA, NA, NA, "Test data",
"Start Time", "Duration", "CPET device", "Serial number", "Firmware version",
"Flow Sensor", "Temperature", "Barometric Pressure", "Humidity",
NA, NA, NA, "Variable", "V'O2", "V'CO2", "V'O2/kg", "V'O2/HR",
"HR", "V'E/V'O2", "V'E/V'CO2", "V'E", "BF", "RER", "WR", NA,
NA, "t", "h:mm:ss.ms", "0:00:25.000", "0:00:26.000", "0:00:27.000",
"0:00:28.000", "0:00:29.000", "0:00:30.000"), ...2 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Unit",
"L/min", "L/min", "ml/min/kg", "ml", "/min", NA, NA, "L/min",
"/min", NA, "W", NA, NA, "Phase", NA, "Rest", "Rest", "Rest",
"Rest", "Rest", "Rest"), ...3 = c(NA, NA, NA, NA, NA, NA, NA,
NA, "343", NA, "GFRex", "343", NA, "female", "21/05/1924", NA,
NA, NA, "178 cm", "88.2 kg", "Blue, medium", NA, NA, "0.96",
"28", NA, "2.06 m2", NA, NA, NA, NA, NA, NA, NA, NA, NA, "12/04/2021 11:27 AM",
"0:15:12", "MetaLyzer 3B-R3", "231821624", "1.3.10", NA, "21.5°C",
"1030mBar", "36%", NA, NA, NA, "Rest", "0.36", "0.31", "4", "0",
"-", "35.7", "40.0", "14.9", "14", "0.88", "0", NA, NA, "Marker",
NA, NA, NA, NA, NA, NA, NA), ...4 = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Unloaded Pedalling",
"-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", NA, NA,
"V'O2", "L/min", "0.61123179277253403", "0.61123179277253403",
"0.61123179277253403", "0.61123179277253403", "0.51731964113453299",
"0.51731964113453299"), ...5 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "Warm Up", "0.61", "0.47",
"7", "0", "-", "26.2", "33.9", "18.5", "16", "0.77", "0", NA,
NA, "V'O2/kg", "ml/min/kg", "6.9339965147196203", "6.9339965147196203",
"6.9339965147196203", "6.9339965147196203", "5.8686289408341796",
"5.8686289408341796"), ...6 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "VT1", "1.22", "0.98", "14",
"-", "-", "28.6", "32.3", "35.4", "22", "0.88", "71", NA, NA,
"V'O2/HR", "ml", "0", "0", "0", "0", "0", "0"), ...7 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT1 % Norm",
"145", "-", "145", "-", "-", "-", "-", "-", "102", "-", "131",
NA, NA, "HR", "/min", NA, NA, NA, NA, NA, NA), ...8 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT1 % Max",
"69", "55", "69", "-", "-", "83", "96", "54", "76", "87", "55",
NA, NA, "WR", "W", "0", "0", "0", "0", "0", "0"), ...9 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT2",
"1.71", "1.66", "19", "-", "-", "31.9", "32.7", "60.0", "32",
"0.97", "122", NA, NA, "V'E/V'O2", NA, "30.6521809263484", "30.6521809263484",
"30.6521809263484", "30.6521809263484", "34.760039405568897",
"34.760039405568897"), ...10 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT2 % Norm", "203", "-",
"203", "-", "-", "-", "-", "-", "147", "-", "226", NA, NA, "V'E/V'CO2",
NA, "35.697970640705897", "35.697970640705897", "35.697970640705897",
"35.697970640705897", "39.618822090063901", "39.618822090063901"
), ...11 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "VT2 % Max", "97", "93", "97", "-", "-", "93",
"97", "92", "110", "96", "96", NA, NA, "RER", NA, "0.858653317715381",
"0.858653317715381", "0.858653317715381", "0.858653317715381",
"0.87736175817015105", "0.87736175817015105"), ...12 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "V'O2peak",
"1.77", "1.79", "20", "0", "-", "34.3", "33.8", "65.4", "29",
"1.01", "128", NA, NA, "V'E", "L/min", "23.334937499999999",
"23.334937499999999", "23.334937499999999", "23.334937499999999",
"21.762284444444401", "21.762284444444401"), ...13 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "V'O2peak % Norm",
"210", "-", "210", "0", "-", "-", "-", "-", "135", "-", "237",
NA, NA, "VT", "L", "0.86250000000000004", "0.86250000000000004",
"0.86250000000000004", "0.86250000000000004", "0.97866666666666702",
"0.97866666666666702"), ...14 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "Normal", "0.84", "-", "10",
"8", "104", "-", "-", "-", "22", "-", "54", NA, NA, "BF", "/min",
"27.055", "27.055", "27.055", "27.055", "22.2366666666667", "22.2366666666667"
), ...15 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "Absolute Maximum Values", "2.06", "1.96", "23",
"0", "-", "44.5", "37.2", "73.5", "34", "1.19", "128", NA, NA,
"V'CO2", "L/min", "0.52483620675725695", "0.52483620675725695",
"0.52483620675725695", "0.52483620675725695", "0.45387646988174501",
"0.45387646988174501"), ...16 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, "WR", "W", "0", "0", "0", "0", "0",
"0")), row.names = c(NA, -70L), class = c("tbl_df", "tbl", "data.frame"
))
I want to remove all rows before ...1 == "t"
. I'm importing multiple files and want to do this to all of them at the sametime, and the header "t" appears at a different row number in each file.
I have tried
df1 = df[-c(1:row_number(df$...1 =="t")),]
df1 = df[-c(rownames(df[df$...1 =="t",])),]
I'd like a base R or dplyr solution. Thanks
CodePudding user response:
In dplyr, the slice
function can be used to select rows by index, and the base-R which()
can tell you which row index to start at.
df %>%
slice(min(which(...1 == 't')):n())
This code will check for any rows on which ...1 == 't'
, then which()
tells you the row index. min()
is in case you get a file with two rows of 't'
. Then, the slice
picks all rows from that row you just found to the end (n()
).
CodePudding user response:
in base R you could do:
df[-seq(which(df[, '...1'] == 't') - 1),]