Home > Mobile >  How to preprocess all datasets from folder through a script in R
How to preprocess all datasets from folder through a script in R

Time:03-07

I have a folder with many files:

C:/1/vyd

Names of these files

/new_vydel_1
/new_vydel_2
/new_vydel_n...
/new_vydel_725

A short dput() of two such datasets:

# Sample of the first dataset (dput output): a 10-row data frame with the
# columns date, row, col, B1-B12 (plus B8A) and the character column COCта.
new_vydel_1=structure(list(date = c("08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018"), row = c(3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L), col = c(49L, 50L, 51L, 43L, 44L, 45L, 46L, 47L, 
48L, 49L), B1 = c(6914L, 6914L, 6914L, 6958L, 6958L, 6958L, 6958L, 
6914L, 6914L, 6914L), B2 = c(5560L, 5380L, 5644L, 5088L, 5280L, 
5200L, 5472L, 5568L, 5560L, 5424L), B3 = c(4768L, 4840L, 4936L, 
4320L, 4388L, 4572L, 4640L, 4704L, 4696L, 4488L), B4 = c(4960L, 
4964L, 4540L, 4164L, 4412L, 4608L, 4628L, 4588L, 4416L, 4312L
), B5 = c(5554L, 5554L, 4782L, 4736L, 4736L, 5018L, 5018L, 4968L, 
4968L, 4677L), B6 = c(5249L, 5249L, 4428L, 4553L, 4553L, 4832L, 
4832L, 4741L, 4741L, 4428L), B7 = c(4893L, 4893L, 4138L, 4527L, 
4527L, 4681L, 4681L, 4505L, 4505L, 4170L), B8 = c(4836L, 4840L, 
5044L, 4074L, 4236L, 4404L, 4592L, 4668L, 4796L, 4628L), B8A = c(4679L, 
4679L, 4098L, 4524L, 4524L, 4643L, 4643L, 4460L, 4460L, 3987L
), B9 = c(6752L, 6752L, 6752L, 7098L, 7098L, 7098L, 7098L, 6752L, 
6752L, 6752L), B10 = c(4170L, 4170L, 3407L, 3301L, 3301L, 3612L, 
3612L, 3600L, 3600L, 3352L), B11 = c(3124L, 3124L, 2514L, 2969L, 
2969L, 3137L, 3137L, 2922L, 2922L, 2487L), B12 = c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), COCта = c("2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", 
"2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", 
"2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е"
)), class = "data.frame", row.names = c(NA, -10L))

and

# Sample of the second dataset (dput output): a 10-row data frame with the
# same columns as new_vydel_1; its COCta-style code string is shorter
# ("5OC3B1C1Е" here versus "2B2OC2OLS3C1Е" in the first sample).
new_vydel_2=structure(list(date = c("08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018"), row = c(4L, 4L, 5L, 5L, 5L, 6L, 
6L, 6L, 6L, 7L), col = c(5L, 6L, 4L, 5L, 6L, 4L, 5L, 6L, 7L, 
3L), B1 = c(8136L, 8136L, 7369L, 8136L, 8136L, 7369L, 8136L, 
8136L, 8136L, 7369L), B2 = c(5796L, 5756L, 5912L, 5956L, 6000L, 
6196L, 6044L, 6164L, 6268L, 6588L), B3 = c(4840L, 4936L, 5096L, 
5040L, 5096L, 5292L, 5280L, 5360L, 5480L, 5584L), B4 = c(4868L, 
4996L, 4908L, 4932L, 5060L, 5136L, 5280L, 5444L, 5492L, 5500L
), B5 = c(5327L, 5327L, 5301L, 5471L, 5471L, 5301L, 5471L, 5471L, 
5846L, 5977L), B6 = c(5207L, 5207L, 5087L, 5290L, 5290L, 5087L, 
5290L, 5290L, 5777L, 5721L), B7 = c(5235L, 5235L, 5001L, 5241L, 
5241L, 5001L, 5241L, 5241L, 5688L, 5539L), B8 = c(4876L, 4828L, 
4936L, 4992L, 5052L, 5236L, 5236L, 5336L, 5380L, 5708L), B8A = c(5209L, 
5209L, 4889L, 5233L, 5233L, 4889L, 5233L, 5233L, 5787L, 5324L
), B9 = c(8407L, 8407L, 7048L, 8407L, 8407L, 7048L, 8407L, 8407L, 
8407L, 7048L), B10 = c(3343L, 3343L, 3188L, 3438L, 3438L, 3188L, 
3438L, 3438L, 3774L, 3543L), B11 = c(2740L, 2740L, 2443L, 2800L, 
2800L, 2443L, 2800L, 2800L, 3228L, 2669L), B12 = c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), COCта = c("5OC3B1C1Е", "5OC3B1C1Е", 
"5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", 
"5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е")), class = "data.frame", row.names = c(NA, 
-10L))

I must preprocess each of these datasets with a script like this one, for example:

# Split the code string in COCта at fixed character positions (after
# characters 1, 4, 6 and 8) into five pieces, keeping the original column.
# NOTE(review): fixed positions assume every COCта string has the same layout;
# the two sample strings differ in length, so the pieces may not line up with
# the intended codes -- confirm against the expected result.
treespring2 <- new_vydel_1 %>%
  separate(
    COCта,
    into = c("B", "E", "C", "OC", "OLS"),
    sep = c(1, 4, 6, 8),
    remove = FALSE
  ) %>%
  # Strip the capital letters A-Z from each piece, then convert what is left
  # to a number and scale it by 10. A piece that cannot be parsed as a number
  # becomes NA (with a coercion warning).
  mutate(across(
    c(C, B, E, OC, OLS),
    ~ as.numeric(stringr::str_remove_all(.x, "[A-Z]")) * 10
  ))
str(treespring2)

But if I have 1000 datasets, I have to change the dataset name in this script by hand every time, which is slow and cumbersome. Is there a way to automatically take all these datasets, preprocess them one by one with the script, and then combine them with rbind into a single data frame, as in the example below?

Here is a dput() example of the final result.

# Expected combined result (dput output): a 20-row data frame. The first 10
# rows carry the values of new_vydel_1 and the last 10 those of new_vydel_2,
# followed by five extra integer columns derived from COCта. The last of
# them, OLS, is NA for the final 10 rows.
final=structure(list(date = c("08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", "08.01.2018", 
"08.01.2018", "08.01.2018"), row = c(3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L), col = c(49L, 
50L, 51L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 5L, 6L, 4L, 5L, 
6L, 4L, 5L, 6L, 7L, 3L), B1 = c(6914L, 6914L, 6914L, 6958L, 6958L, 
6958L, 6958L, 6914L, 6914L, 6914L, 8136L, 8136L, 7369L, 8136L, 
8136L, 7369L, 8136L, 8136L, 8136L, 7369L), B2 = c(5560L, 5380L, 
5644L, 5088L, 5280L, 5200L, 5472L, 5568L, 5560L, 5424L, 5796L, 
5756L, 5912L, 5956L, 6000L, 6196L, 6044L, 6164L, 6268L, 6588L
), B3 = c(4768L, 4840L, 4936L, 4320L, 4388L, 4572L, 4640L, 4704L, 
4696L, 4488L, 4840L, 4936L, 5096L, 5040L, 5096L, 5292L, 5280L, 
5360L, 5480L, 5584L), B4 = c(4960L, 4964L, 4540L, 4164L, 4412L, 
4608L, 4628L, 4588L, 4416L, 4312L, 4868L, 4996L, 4908L, 4932L, 
5060L, 5136L, 5280L, 5444L, 5492L, 5500L), B5 = c(5554L, 5554L, 
4782L, 4736L, 4736L, 5018L, 5018L, 4968L, 4968L, 4677L, 5327L, 
5327L, 5301L, 5471L, 5471L, 5301L, 5471L, 5471L, 5846L, 5977L
), B6 = c(5249L, 5249L, 4428L, 4553L, 4553L, 4832L, 4832L, 4741L, 
4741L, 4428L, 5207L, 5207L, 5087L, 5290L, 5290L, 5087L, 5290L, 
5290L, 5777L, 5721L), B7 = c(4893L, 4893L, 4138L, 4527L, 4527L, 
4681L, 4681L, 4505L, 4505L, 4170L, 5235L, 5235L, 5001L, 5241L, 
5241L, 5001L, 5241L, 5241L, 5688L, 5539L), B8 = c(4836L, 4840L, 
5044L, 4074L, 4236L, 4404L, 4592L, 4668L, 4796L, 4628L, 4876L, 
4828L, 4936L, 4992L, 5052L, 5236L, 5236L, 5336L, 5380L, 5708L
), B8A = c(4679L, 4679L, 4098L, 4524L, 4524L, 4643L, 4643L, 4460L, 
4460L, 3987L, 5209L, 5209L, 4889L, 5233L, 5233L, 4889L, 5233L, 
5233L, 5787L, 5324L), B9 = c(6752L, 6752L, 6752L, 7098L, 7098L, 
7098L, 7098L, 6752L, 6752L, 6752L, 8407L, 8407L, 7048L, 8407L, 
8407L, 7048L, 8407L, 8407L, 8407L, 7048L), B10 = c(4170L, 4170L, 
3407L, 3301L, 3301L, 3612L, 3612L, 3600L, 3600L, 3352L, 3343L, 
3343L, 3188L, 3438L, 3438L, 3188L, 3438L, 3438L, 3774L, 3543L
), B11 = c(3124L, 3124L, 2514L, 2969L, 2969L, 3137L, 3137L, 2922L, 
2922L, 2487L, 2740L, 2740L, 2443L, 2800L, 2800L, 2443L, 2800L, 
2800L, 3228L, 2669L), B12 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), COCта = c("2B2OC2OLS3C1Е", 
"2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", 
"2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", "2B2OC2OLS3C1Е", 
"2B2OC2OLS3C1Е", "5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", 
"5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", "5OC3B1C1Е", 
"5OC3B1C1Е"), B = c(20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 
20L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 50L), Е = c(20L, 
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 30L, 30L, 30L, 30L, 
30L, 30L, 30L, 30L, 30L, 30L), C = c(20L, 20L, 20L, 20L, 20L, 
20L, 20L, 20L, 20L, 20L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L), OC = c(30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 
30L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L), OLS = c(10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, 
-20L))

All NA values must be replaced with 0.

How can I perform these manipulations? Any help is appreciated. Thank you.

CodePudding user response:

Maybe this works for you:

I'm going to assume your files are .csv files. Create a list of those files (list_dfs), and then import the files into a list (my_data):

# Collect the full paths of all CSV files in the data folder. The pattern is
# anchored on a literal ".csv" extension so that names which merely end in
# the letters "csv" are not picked up.
list_dfs <- list.files("C:/1/vyd", pattern = "\\.csv$", full.names = TRUE)

# Read every file into its own data frame; the result is a list with one
# element per file. read.csv() splits fields on commas, whereas read.delim()
# defaults to tab-separated input and would read a comma-separated file as a
# single column. NOTE(review): if the files use ";" as the separator, use
# read.csv2() instead -- confirm against the actual files.
my_data <- lapply(list_dfs, read.csv)

Now, for the preprocessing, create a function that performs the necessary tasks (preprocessing and replacing NAs):

# Preprocess a single dataset.
#
# Args:
#   dat: a data frame containing the character column COCта.
#
# Returns:
#   `dat` with five additional numeric columns (B, E, C, OC, OLS) placed
#   after COCта, and with every NA in the data frame replaced by 0.
preprocess <- function(dat) {
  # Split COCта at fixed character positions (after characters 1, 4, 6 and 8)
  # into five pieces, keeping the original column.
  # NOTE(review): fixed positions assume every COCта string has the same
  # layout -- confirm for strings of different lengths.
  treespring2 <- dat %>%
    separate(
      COCта,
      into = c("B", "E", "C", "OC", "OLS"),
      sep = c(1, 4, 6, 8),
      remove = FALSE
    ) %>%
    # Strip the capital letters A-Z from each piece, then convert the rest to
    # a number scaled by 10. Unparseable pieces become NA (with a warning).
    mutate(across(
      c(C, B, E, OC, OLS),
      ~ as.numeric(stringr::str_remove_all(.x, "[A-Z]")) * 10
    ))

  # Replace every NA, in any column, with 0.
  treespring2[is.na(treespring2)] <- 0

  treespring2
}

Apply this function to the list of data frames (my_data):

# Run preprocess() on each imported data frame; the result is a list with one
# preprocessed data frame per element of my_data.
dat.prep <- lapply(X = my_data, FUN = preprocess)

Finally, use the do.call function to rbind the list of data frames:

# Stack all preprocessed data frames row-wise into a single data frame by
# calling rbind() with the list elements as its arguments.
dat.bind <- do.call(what = "rbind", args = dat.prep)

And you might get the desired output.

  • Related