After extracting text from PDF files, my variables contain the literal character sequences `\n` and `\\n`. How can I remove them?
I have tried `form2_df$firm_new <- str_replace_all(form2_df$firm, "\\n", "")`, but it did nothing.
Here is my dput output:
structure(list(firm = c("\\n\\n X, P.C.\\n\\n", "\\n\\n \\\"Y & Company, CPA, PC\\n\\n",
"\\n\\n NGroup, Ltd LLP\\n\\n", "\\n\\n 247 ting, LLC\\n\\n"
), issuer_name = c("c(\"\\\\n New Continent Ltd.\\\\n\\\\n \", \"\\\\n FellCorp.\\\\n\\\\n \", \"\\\\n Chain New Ltd.\\\\n\\\\n \", \"\\\\n Fellazo Corp.\\\\n\\\\n \", \"\\\\n Seed Corp.\\\\n\\\\n \", \"\\\\n Greenland Technologies Horp.\\\\n\\\\n \", \"\\\\n Indoor \\\\n\\\\n \", \"\\\\n Packaging, Inc.\\\\n\\\\n \", \"\\\\n IT Tech Packaging, Inc.\\\\n\\\\n \", \"\\\\n Holdings, Inc.\\\\n\\\\n \", \"\\\\n PK Kirk Inc.\\\\n\\\\n \", \"\\\\n Planet Corp.\\\\n\\\\n \", \"\\\\n Art Co., Ltd.\\\\n\\\\n \", \"\\\\n Resource Group\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \")",
"c(\"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \")",
"c(\"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \")",
"c(\"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \", \"\\\\n\\\\n\\\\n \")"
), num = c("c(\"\\\\n 1641398 \", \"\\\\n 1659207 \", \"\\\\n 1641398 \", \"\\\\n 1659207 \", \"\\\\n 1524829 \", \"\\\\n 1735041 \", \n\"\\\\n 1572565 \", \"\\\\nC, P.C.: Annual Report OB 2 (v. 2.10) Page 7 / 24\\\\n 1358190 \", \"\\\\n 1358190 \", \"\\\\n 1816172 \", \n\"\\\\n 1833372 \", \"\\\\n 1117057111 \", \"\\\\n 1491487 \", \"\\\\n 1409431 \", \"\\\\n \", \"\\\\n 0000857455 \", \n\"\\\\n \", \"\\\\n 0000857455 \", \"\\\\n 0001090102 \", \"\\\\n 0000702238 \", \"\\\\n 0000857455 \", \"\\\\n 0001090102 \", \n\"\\\\n 0000702238 \", \"\\\\n 0001364891 \", \"\\\\n 1753567 \", \"\\\\nC, P.C.: Annual Report OB Form 2 (v. 2.10) Page 11 / 24\\\\n 861354 \", \n\"\\\\n 861354 \")",
"c(\"\\\\n d\\\\n e\\\\n f\\\\n g\", \"\\\\n d\\\\n e\\\\n f\\\\n g\\\\n c\", \n\"\\\\n c\\\\n d\\\\n e\\\\n f\\\\n g\")",
"c(\"\\\\n d\\\\n e\\\\n f\\\\n g\\\\n c\", \"\\\\n c\\\\n d\\\\n e\\\\n f\\\\n g\", \n\"\\\\n d\\\\n e\\\\n f\\\\n g\\\\n c\", \"\\\\n e\\\\n f\\\\n g\\\\n c\", \n\"\\\\n c\\\\n d\\\\n e\\\\n f\\\\n g\\\\n b\", \n\"\\\\n b\\\\n c\\\\n d\\\\n e\\\\n f\\\\n g\", \n\"\\\\n d\\\\n e\\\\n f\\\\n g\\\\n b\", \"\\\\n c\\\\n d\\\\n e\\\\n f\\\\n g\\\\n b\"\n)",
"c(\"\\\\n \", \"\\\\n \", \"\\\\n \", \"\\\\n \", \"\\\\n \", \"\\\\n \"\n)"
), number_of_accountants = c("7\\n\\n", "1 d\\n e\\n g\\n c g\\n f f\\n c\\n e\\n d\\n\\n CA CR\\n",
"5 d\\n c g\\n g\\n e\\n f c\\n e\\n f\\n d\\n\\n CA CR\\n",
"3\\n\\n"), firm_new = c("\\n\\n WC, P.C.\\n\\n", "\\n\\n \\\"John Company, PC\\n\\n",
"\\n\\n BM Group, Ltd LLP\\n\\n", "\\n\\n Continuous LLC\\n\\n"
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
CodePudding user response:
I took the liberty of cleaning more than you asked for, but I am not sure whether that helps or makes the text more difficult to understand.
library(dplyr)

# Strip PDF-extraction debris from a character vector.
#
# The earlier one-liner used the character class '["\\nc()]', which deletes
# every single "n", "c", quote, backslash and parenthesis, so real text was
# damaged ("Continent" became "Cotiet"). Here the escaped newline is matched
# as a sequence, and the c(...) wrapper is removed only at the string ends.
#
# x:       character vector (other types are coerced by gsub()).
# returns: character vector of the same length as x.
clean_text <- function(x) {
  # Literal "\n" / "\\n": one or more backslashes followed by "n".
  x <- gsub("\\\\+n", " ", x)
  # Flatten real newlines/tabs so the wrapper pattern below sees one line.
  x <- trimws(gsub("\\s+", " ", x))
  # Unwrap a deparsed vector, c("a", "b") -> "a", "b".
  # Parentheses inside the text, e.g. "(v. 2.10)", are kept.
  x <- sub("^c\\((.*)\\)$", "\\1", x)
  # Drop the remaining double quotes and stray backslashes.
  x <- gsub("[\"\\\\]", "", x)
  # Collapse whitespace runs, then trim blanks and commas from both ends.
  trimws(gsub("\\s+", " ", x), whitespace = "[ \t\r\n,]")
}

df %>%
  # Clean every column, as before; `.cols` is now explicit because calling
  # across() without it is deprecated since dplyr 1.1.0.
  mutate(across(everything(), clean_text))
# Expected cleaned values for the example data (worked out from the dput()
# above, not a verbatim console print):
#   firm[1]                   "X, P.C."
#   firm[2]                   "Y & Company, CPA, PC"
#   firm[4]                   "247 ting, LLC"
#   issuer_name[1]            "New Continent Ltd. , FellCorp. , Chain New Ltd. , ..."
#   issuer_name[2:4]          ""
#   number_of_accountants[1]  "7"
#   firm_new[4]               "Continuous LLC"