I have 2 data frames which look like:
# A tibble: 52 × 2
provincia mean_price
<chr> <dbl>
1 A Coruña 137624.
2 Albacete 2115.
3 Alicante 31798.
col1 col2 col3 col4
1 04 Almería Almería <NA>
2 11 Cádiz Cádiz <NA>
3 14 Córdoba Córdoba <NA>
I am trying to join them together using full_join
but not all of the names are being matched correctly due to differences in the spelling. I can run the following (with corresponding output):
df1 %>%
full_join(df2, by = c("provincia" = "col2"))
# A tibble: 60 × 5
provincia mean_price col1 col3 col4
<chr> <dbl> <chr> <chr> <chr>
1 A Coruña 137624. NA NA NA
2 Albacete 2115. 02 Albacete NA
3 Alicante 31798. NA NA NA
4 Almería -210404. 04 Almería NA
So it was able to match up many of the names but it failed on a few. In another column I have different variations of the spellings:
df1 %>%
full_join(df2, by = c("provincia" = "col3")) %>%
drop_na(col4)
provincia mean_price col1 col2 col4
<chr> <dbl> <chr> <chr> <chr>
1 Lugo -134055. 27 Lugo Lugo
2 Pontevedra -374778. 36 Pontevedra Pontevedra
3 La Coruña NA 15 Coruña, A A Coruña
4 Orense NA 32 Ourense Ourense
So I have 2 data frames I want to join, in df1
the column doesn't change, its always provincias
- in df2
if the name did not match in col1
, try col2
, then try col3
and so on until all of the columns have been applied to try and full_join
the data.
Some of the problem examples are:
In df1
we have
A Coruña
Alicante
Castellón
In df2
we have:
col2 = Coruña, A
col3 = La Coruña
col4 = A Coruña # joined up using this
and
col2 = Alicante/Alacant
col3 = Alicante # joined up using this
col4 = NA
and
col2 = Castellón/Castelló
col3 = Castellón # joined up using this
col4 = NA
Expected output: Trying to create a function to map over/ loop over and try to join the data together, moving along the columns if the previous column did not join successfully.
Data
df1 = structure(list(provincia = c("A Coruña", "Albacete", "Alicante",
"Almería", "Araba - Álava", "Asturias", "Ávila", "Badajoz",
"Barcelona", "Bizkaia", "Burgos", "Cáceres", "Cádiz", "Cantabria",
"Castellón", "Ceuta", "Ciudad Real", "Córdoba", "Cuenca", "Gipuzkoa",
"Girona", "Granada", "Guadalajara", "Huelva", "Huesca", "Illes Balears",
"Jaén", "La Rioja", "Las Palmas", "León", "Lleida", "Lugo",
"Madrid", "Málaga", "Melilla", "Murcia", "Navarra", "Ourense",
"Palencia", "Pontevedra", "Salamanca", "Santa Cruz de Tenerife",
"Segovia", "Sevilla", "Soria", "Tarragona", "Teruel", "Toledo",
"Valencia", "Valladolid", "Zamora", "Zaragoza"), mean_price = c(137624.198693295,
2114.81534770306, 31797.6042278187, -210404.350690943, 677319.793678721,
-105607.758762033, -149924.778464006, -2097656.49617517, 128038.251992261,
-378733.392057684, 191990.956696032, 695.564248280345, 1428676.05256214,
3153.68145681899, 718682.972504407, -190874.713744811, 106101.705786415,
338330.550863887, 13849.7490258121, 87318.2683658272, 118338.163156024,
162218.313332224, -851960.092180063, 221206.432450402, 90652.890697791,
-8181130.3432743, 44727.9976028575, -1573959.50553556, -497997.406878741,
927365.585585463, 145165.81648469, -134054.92318867, 987306.447344823,
150960.003949561, -517183.316043251, -267412.822954099, -414282.66968961,
1013507.94117714, 589115.889523749, -374778.481273493, 108896.549765545,
-9019.89307188885, -52818.219435273, -217912.286950506, 6030463.0529272,
361426.680584978, -1069517.98520776, 135542.864155717, -236847.461222156,
156897.290850635, -80528.5521733855, 333167.648531738)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -52L))
df2 = structure(list(col1 = c("04", "11", "14", "18", "21", "23", "29",
"41", "22", "44", "50", "33", "07", "35", "38", "39", "05", "09",
"24", "34", "37", "40", "42", "47", "49", "02", "13", "16", "19",
"45", "08", "17", "25", "43", "03", "12", "46", "06", "10", "15",
"27", "32", "36", "28", "30", "31", "01", "20", "48", "26", "51",
"52"), col2 = c("Almería", "Cádiz", "Córdoba", "Granada",
"Huelva", "Jaén", "Málaga", "Sevilla", "Huesca", "Teruel",
"Zaragoza", "Asturias", "Balears, Illes", "Palmas, Las", "Santa Cruz de Tenerife",
"Cantabria", "Ávila", "Burgos", "León", "Palencia", "Salamanca",
"Segovia", "Soria", "Valladolid", "Zamora", "Albacete", "Ciudad Real",
"Cuenca", "Guadalajara", "Toledo", "Barcelona", "Girona", "Lleida",
"Tarragona", "Alicante/Alacant", "Castellón/Castelló", "Valencia/València",
"Badajoz", "Cáceres", "Coruña, A", "Lugo", "Ourense", "Pontevedra",
"Madrid", "Murcia", "Navarra", "Araba/Álava", "Gipuzkoa", "Bizkaia",
"Rioja, La", "Ceuta", "Melilla"), col3 = c("Almería", "Cádiz",
"Córdoba", "Granada", "Huelva", "Jaén", "Málaga", "Sevilla",
"Huesca", "Teruel", "Zaragoza", "Asturias", "Baleares", "Las Palmas",
"Santa Cruz de Tenerife", "Cantabria", "Ávila", "Burgos", "León",
"Palencia", "Salamanca", "Segovia", "Soria", "Valladolid", "Zamora",
"Albacete", "Ciudad Real", "Cuenca", "Guadalajara", "Toledo",
"Barcelona", "Gerona", "Lérida", "Tarragona", "Alicante", "Castellón",
"Valencia", "Badajoz", "Cáceres", "La Coruña", "Lugo", "Orense",
"Pontevedra", "Madrid", "Murcia", "Navarra", "Álava", NA, NA,
"La Rioja", "Ceuta", "Melilla"), col4 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "A Coruña", "Lugo", "Ourense", "Pontevedra", NA, NA,
NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-52L))
CodePudding user response:
We may use regex_full_join
from fuzzyjoin
library(fuzzyjoin)
regex_full_join(df1, df2, by = c("provincia" = "col3"))
-output
# A tibble: 59 × 6
provincia mean_price col1 col2 col3 col4
<chr> <dbl> <chr> <chr> <chr> <chr>
1 Albacete 2115. 02 Albacete Albacete <NA>
2 Alicante 31798. 03 Alicante/Alacant Alicante <NA>
3 Almería -210404. 04 Almería Almería <NA>
4 Araba - Álava 677320. 01 Araba/Álava Álava <NA>
5 Asturias -105608. 33 Asturias Asturias <NA>
6 Ávila -149925. 05 Ávila Ávila <NA>
7 Badajoz -2097656. 06 Badajoz Badajoz <NA>
8 Barcelona 128038. 08 Barcelona Barcelona <NA>
9 Burgos 191991. 09 Burgos Burgos <NA>
10 Cáceres 696. 10 Cáceres Cáceres <NA>
# … with 49 more rows
In case we want to do the join sequentially on each of the col2, col3, col4, here is one method with powerjoin
library(powerjoin)
library(purrr)
out <- map(names(df2)[-1],
~ regex_left_join(df1, df2, by = c("provincia" = .x))) %>%
reduce(power_left_join, by = c("provincia", "mean_price"),
conflict = coalesce_xy)
-output
> as.data.frame(out)
provincia mean_price col1 col2 col3 col4
1 A Coruña 137624.1987 15 Coruña, A La Coruña A Coruña
2 Albacete 2114.8153 02 Albacete Albacete <NA>
3 Alicante 31797.6042 03 Alicante/Alacant Alicante <NA>
4 Almería -210404.3507 04 Almería Almería <NA>
5 Araba - Álava 677319.7937 01 Araba/Álava Álava <NA>
6 Asturias -105607.7588 33 Asturias Asturias <NA>
7 Ávila -149924.7785 05 Ávila Ávila <NA>
8 Badajoz -2097656.4962 06 Badajoz Badajoz <NA>
9 Barcelona 128038.2520 08 Barcelona Barcelona <NA>
10 Bizkaia -378733.3921 48 Bizkaia <NA> <NA>
11 Burgos 191990.9567 09 Burgos Burgos <NA>
12 Cáceres 695.5642 10 Cáceres Cáceres <NA>
13 Cádiz 1428676.0526 11 Cádiz Cádiz <NA>
14 Cantabria 3153.6815 39 Cantabria Cantabria <NA>
15 Castellón 718682.9725 12 Castellón/Castelló Castellón <NA>
16 Ceuta -190874.7137 51 Ceuta Ceuta <NA>
17 Ciudad Real 106101.7058 13 Ciudad Real Ciudad Real <NA>
18 Córdoba 338330.5509 14 Córdoba Córdoba <NA>
19 Cuenca 13849.7490 16 Cuenca Cuenca <NA>
20 Gipuzkoa 87318.2684 20 Gipuzkoa <NA> <NA>
21 Girona 118338.1632 17 Girona Gerona <NA>
22 Granada 162218.3133 18 Granada Granada <NA>
23 Guadalajara -851960.0922 19 Guadalajara Guadalajara <NA>
24 Huelva 221206.4325 21 Huelva Huelva <NA>
25 Huesca 90652.8907 22 Huesca Huesca <NA>
26 Illes Balears -8181130.3433 <NA> <NA> <NA> <NA>
27 Jaén 44727.9976 23 Jaén Jaén <NA>
28 La Rioja -1573959.5055 26 Rioja, La La Rioja <NA>
29 Las Palmas -497997.4069 35 Palmas, Las Las Palmas <NA>
30 León 927365.5856 24 León León <NA>
31 Lleida 145165.8165 25 Lleida Lérida <NA>
32 Lugo -134054.9232 27 Lugo Lugo Lugo
33 Madrid 987306.4473 28 Madrid Madrid <NA>
34 Málaga 150960.0039 29 Málaga Málaga <NA>
35 Melilla -517183.3160 52 Melilla Melilla <NA>
36 Murcia -267412.8230 30 Murcia Murcia <NA>
37 Navarra -414282.6697 31 Navarra Navarra <NA>
38 Ourense 1013507.9412 32 Ourense Orense Ourense
39 Palencia 589115.8895 34 Palencia Palencia <NA>
40 Pontevedra -374778.4813 36 Pontevedra Pontevedra Pontevedra
41 Salamanca 108896.5498 37 Salamanca Salamanca <NA>
42 Santa Cruz de Tenerife -9019.8931 38 Santa Cruz de Tenerife Santa Cruz de Tenerife <NA>
43 Segovia -52818.2194 40 Segovia Segovia <NA>
44 Sevilla -217912.2870 41 Sevilla Sevilla <NA>
45 Soria 6030463.0529 42 Soria Soria <NA>
46 Tarragona 361426.6806 43 Tarragona Tarragona <NA>
47 Teruel -1069517.9852 44 Teruel Teruel <NA>
48 Toledo 135542.8642 45 Toledo Toledo <NA>
49 Valencia -236847.4612 46 Valencia/València Valencia <NA>
50 Valladolid 156897.2909 47 Valladolid Valladolid <NA>
51 Zamora -80528.5522 49 Zamora Zamora <NA>
52 Zaragoza 333167.6485 50 Zaragoza Zaragoza <NA>