I am trying to merge two files together based on dates: However, there are two issues:
- I cannot use the left join function due to one of the files having the dates set as characters, and the other one set as dates. Changing one of the files values to dates does not fix the issue as it creates a list instead
- Using the merge function leads to a lot of missing values, as one of my dataset does not contain data for each date
jointdataset <- merge(group_df, group_tweet, by = 'date', all.x= TRUE)
View(jointdataset)
jointdataset <- dplyr::left_join(group_df, group_tweet)
Here is how my data looks like
> dput(group_tweet)
structure(list(date = structure(c(18628, 18629, 18630, 18631,
18632, 18633, 18634, 18635, 18636, 18637, 18638, 18639, 18640,
18641, 18642, 18643, 18644, 18645, 18646, 18647, 18648, 18649,
18650, 18651, 18652, 18653, 18654, 18655, 18656, 18657, 18658,
18659, 18660, 18661, 18662, 18663, 18664, 18665, 18666, 18667,
18668, 18669, 18670, 18671, 18672, 18673, 18674, 18675, 18676,
18677, 18678, 18679, 18680, 18681, 18682, 18683, 18684, 18685,
18686, 18687, 18688, 18689, 18690, 18691, 18692, 18693, 18694,
18695, 18696, 18697, 18698, 18699, 18700, 18701, 18702, 18703,
18704, 18705, 18706, 18707, 18708, 18709, 18710, 18711, 18712,
18713, 18714, 18715, 18716, 18717, 18718, 18719, 18720, 18721,
18722, 18723, 18724, 18725, 18726, 18727, 18728, 18729, 18730,
18731, 18732, 18733, 18734, 18735, 18736, 18737, 18738, 18739,
18740, 18741, 18742, 18743, 18744, 18745, 18746, 18747, 18748,
18749, 18750, 18751, 18752, 18753, 18754, 18755, 18756, 18757,
18758, 18759, 18760, 18761, 18762, 18763, 18764, 18765, 18766,
18767, 18768, 18769, 18770, 18771, 18772, 18773, 18774, 18775,
18776, 18777, 18778, 18779, 18780, 18781, 18782, 18783, 18784,
18785, 18786, 18787, 18788, 18789, 18790, 18791, 18792, 18793,
18794, 18795, 18796, 18797, 18798, 18799, 18800, 18801, 18802,
18803, 18804, 18805, 18806, 18807, 18808), class = "Date"), `length(text)` = c(1324L,
1548L, 1297L, 1585L, 1636L, 1583L, 1492L, 1676L, 1745L, 1389L,
1718L, 1781L, 1858L, 1798L, 1714L, 1808L, 1315L, 1644L, 1603L,
1820L, 1770L, 1843L, 1885L, 1390L, 1763L, 1875L, 1912L, 1812L,
1764L, 2117L, 1641L, 1914L, 1963L, 2092L, 1968L, 2021L, 2023L,
1331L, 1557L, 1555L, 1904L, 2034L, 1850L, 2067L, 1507L, 1738L,
1915L, 2057L, 1931L, 1859L, 2010L, 1525L, 1835L, 1850L, 1958L,
1848L, 1929L, 2015L, 1449L, 1823L, 1796L, 1902L, 1888L, 1875L,
2078L, 1442L, 1757L, 1877L, 2104L, 1926L, 1949L, 2175L, 1604L,
2030L, 1918L, 2011L, 1978L, 1959L, 2171L, 1528L, 1936L, 1889L,
2132L, 1907L, 2173L, 2233L, 1581L, 1793L, 1986L, 2128L, 2030L,
1805L, 1954L, 1459L, 1691L, 1967L, 2049L, 1835L, 1948L, 2246L,
1581L, 1950L, 1904L, 2245L, 2053L, 1877L, 1913L, 1571L, 1932L,
2004L, 2058L, 2087L, 1989L, 2180L, 1567L, 1865L, 1995L, 2144L,
2169L, 2148L, 2318L, 1606L, 1856L, 1948L, 2036L, 1887L, 2021L,
2132L, 1390L, 1717L, 1872L, 1919L, 1867L, 1994L, 2083L, 1509L,
1786L, 1808L, 1860L, 1854L, 1813L, 2102L, 1513L, 1890L, 1877L,
2063L, 1857L, 1827L, 2059L, 1413L, 1614L, 2153L, 1859L, 1920L,
1877L, 2106L, 1458L, 1822L, 1851L, 2005L, 1984L, 2097L, 2396L,
1607L, 2106L, 2256L, 2398L, 2245L, 2239L, 2287L, 1564L, 1880L,
1991L, 2053L, 2017L, 2012L, 1998L, 1361L, 1663L, 1778L, 1987L
)), row.names = c(NA, -181L), class = c("tbl_df", "tbl", "data.frame"
))
dput(group_df)
structure(list(date = c("01/01/2021", "01/05/2021", "01/06/2021",
"01/07/2021", "01/11/2021", "01/12/2021", "01/14/2021", "01/15/2021",
"01/18/2021", "01/19/2021", "01/20/2021", "01/21/2021", "01/22/2021",
"01/23/2021", "01/26/2021", "01/27/2021", "01/28/2021", "01/29/2021",
"01/31/2021", "02/01/2021", "02/02/2021", "02/04/2021", "02/08/2021",
"02/09/2021", "02/10/2021", "02/11/2021", "02/12/2021", "02/15/2021",
"02/16/2021", "02/18/2021", "02/19/2021", "02/22/2021", "02/23/2021",
"02/24/2021", "02/25/2021", "02/27/2021", "03/01/2021", "03/02/2021",
"03/04/2021", "03/05/2021", "03/06/2021", "03/07/2021", "03/08/2021",
"03/09/2021", "03/11/2021", "03/14/2021", "03/15/2021", "03/16/2021",
"03/17/2021", "03/18/2021", "03/19/2021", "03/20/2021", "03/21/2021",
"03/22/2021", "03/23/2021", "03/24/2021", "03/25/2021", "03/26/2021",
"03/28/2021", "03/29/2021", "03/30/2021", "03/31/2021", "04/01/2021",
"04/02/2021", "04/03/2021", "04/04/2021", "04/05/2021", "04/06/2021",
"04/07/2021", "04/08/2021", "04/09/2021", "04/10/2021", "04/11/2021",
"04/12/2021", "04/13/2021", "04/14/2021", "04/15/2021", "04/16/2021",
"04/17/2021", "04/18/2021", "04/19/2021", "04/20/2021", "04/21/2021",
"04/22/2021", "04/23/2021", "04/24/2021", "04/26/2021", "04/27/2021",
"04/28/2021", "04/29/2021", "04/30/2021", "05/01/2021", "05/02/2021",
"05/03/2021", "05/04/2021", "05/05/2021", "05/06/2021", "05/07/2021",
"05/08/2021", "05/10/2021", "05/11/2021", "05/12/2021", "05/13/2021",
"05/14/2021", "05/15/2021", "05/18/2021", "05/19/2021", "05/20/2021",
"05/21/2021", "05/22/2021", "05/23/2021", "05/25/2021", "05/26/2021",
"05/27/2021", "05/29/2021", "05/31/2021", "06/01/2021", "06/02/2021",
"06/03/2021", "06/04/2021", "06/05/2021", "06/06/2021", "06/07/2021",
"06/08/2021", "06/09/2021", "06/10/2021", "06/11/2021", "06/12/2021",
"06/13/2021", "06/14/2021", "06/15/2021", "06/16/2021", "06/17/2021"
), `length(category)` = c(4L, 8L, 4L, 4L, 4L, 12L, 8L, 8L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 8L, 9L, 25L, 10L, 5L, 10L,
5L, 20L, 5L, 5L, 19L, 10L, 20L, 5L, 5L, 5L, 2L, 12L, 22L, 14L,
2L, 4L, 6L, 9L, 5L, 37L, 5L, 4L, 10L, 12L, 21L, 4L, 4L, 6L, 6L,
9L, 16L, 9L, 6L, 13L, 2L, 19L, 24L, 20L, 8L, 25L, 16L, 5L, 24L,
27L, 24L, 6L, 9L, 15L, 8L, 25L, 35L, 26L, 8L, 2L, 5L, 14L, 12L,
20L, 19L, 7L, 9L, 5L, 4L, 17L, 13L, 15L, 8L, 10L, 7L, 5L, 16L,
11L, 16L, 10L, 6L, 3L, 22L, 10L, 10L, 3L, 1L, 7L, 4L, 6L, 4L,
5L, 3L, 6L, 3L, 3L, 1L, 3L, 9L, 6L, 4L, 2L, 1L, 1L, 3L, 5L, 2L,
4L, 2L, 2L, 1L, 5L, 5L, 6L, 3L), `sum(usd_pledged)` = c(50278.64,
366279.590415302, 172073.0471292, 230.537553792, 304353.5676352,
285277.861423738, 931521.92, 62322.104033708, 292739.37663744,
97895.025306156, 20538.4758468, 5716, 69712, 15248.7519561728,
1257502.99126928, 102268, 32151.762183708, 70322.17520884, 27549.00453216,
8371657.96195552, 1420782.4068818, 137818.171860595, 1175436.75496273,
1770825.83285245, 1070, 178761.8127624, 10016.918409372, 146129.2,
610608.234267955, 676175.367448825, 1274147.86429578, 45595.6660422,
1776940, 4198.748196, 17.02446963, 55272.18380506, 490750.12364435,
571800.5, 227.26420984, 619.23335154, 5942.151506976, 1098.48977709,
87300, 1879564.14902818, 57100, 3.628861728, 715993, 228885.860968739,
1005412.33040269, 32978.39955816, 1956.52590528, 5789.84841572,
508.6539266268, 192238.643979976, 278988.70418106, 32470.60072344,
10653.5543364, 333900.00289616, 2467.35065664, 698082.326436438,
581461.49769354, 170032.27513805, 24516.65281874, 11530.3738156855,
20060.6125168384, 154030.2061095, 215919.1704622, 1320696.42919177,
2000470.11990896, 1414.108082664, 12429.5108974052, 80676.567104964,
3211.42501648, 69994.39317561, 827188.797715076, 51349.427891072,
47925.216359587, 42, 28391.25545206, 6199141.75469484, 16078.9170341546,
724307.969231123, 238317.166592813, 243887.2812338, 3856.910710253,
5982.1359855268, 1986.4520326, 180186.734055936, 25818.301703542,
284175.24917946, 27486.5134227176, 51837.3569258, 218101.634171675,
543.2343820104, 299634.97422679, 9200.1639420603, 112660.244016819,
27675.2965010449, 5203.03118806, 75650.25, 327968.549088524,
1002.27102748, 37881.2795828444, 2896.1824198269, 596, 206880.064933601,
1688.14367074, 19176.1722666628, 48811.21823157, 4572.83962465,
375.36475346, 126743.686979946, 489.0998, 4042.91231235, 1343.0098244,
2286.711086, 77485.7681307, 25272.4433349128, 11630.0007860946,
377.22065991, 199.9004139, 161, 2667.93172391, 69404.71338983,
23796.67811208, 17174.64072667, 9950.0416377665, 35443.95356773,
184.603426004, 85934.0997877056, 5603.88739935, 2306.02823117,
13.0064795336), `sum(backers_count)` = c(2880L, 6588L, 3528L,
16L, 4204L, 6632L, 15404L, 1672L, 3588L, 416L, 464L, 364L, 1488L,
228L, 17124L, 2284L, 1348L, 2324L, 744L, 185822L, 35095L, 5980L,
24615L, 32525L, 40L, 4650L, 450L, 2640L, 15952L, 9275L, 16404L,
990L, 35075L, 79L, 6L, 1578L, 10705L, 9302L, 12L, 20L, 36L, 91L,
1505L, 40509L, 1890L, 4L, 9684L, 4196L, 16477L, 754L, 62L, 54L,
36L, 3787L, 3809L, 996L, 132L, 13216L, 50L, 12073L, 13826L, 4353L,
220L, 393L, 628L, 3607L, 6712L, 22403L, 30468L, 106L, 136L, 1854L,
102L, 1623L, 14638L, 1396L, 1923L, 6L, 426L, 161556L, 1020L,
6922L, 3575L, 1627L, 84L, 101L, 56L, 2291L, 450L, 2439L, 678L,
864L, 3195L, 17L, 6072L, 215L, 3119L, 1473L, 85L, 2306L, 7622L,
41L, 973L, 117L, 20L, 3127L, 45L, 424L, 819L, 84L, 12L, 3239L,
18L, 66L, 7L, 35L, 1651L, 651L, 242L, 25L, 13L, 6L, 59L, 1747L,
325L, 244L, 231L, 862L, 6L, 1819L, 74L, 66L, 4L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -133L))
dput(group_df)
structure(list(date = c("01/01/2021", "01/05/2021", "01/06/2021",
"01/07/2021", "01/11/2021", "01/12/2021", "01/14/2021", "01/15/2021",
"01/18/2021", "01/19/2021", "01/20/2021", "01/21/2021", "01/22/2021",
"01/23/2021", "01/26/2021", "01/27/2021", "01/28/2021", "01/29/2021",
"01/31/2021", "02/01/2021", "02/02/2021", "02/04/2021", "02/08/2021",
"02/09/2021", "02/10/2021", "02/11/2021", "02/12/2021", "02/15/2021",
"02/16/2021", "02/18/2021", "02/19/2021", "02/22/2021", "02/23/2021",
"02/24/2021", "02/25/2021", "02/27/2021", "03/01/2021", "03/02/2021",
"03/04/2021", "03/05/2021", "03/06/2021", "03/07/2021", "03/08/2021",
"03/09/2021", "03/11/2021", "03/14/2021", "03/15/2021", "03/16/2021",
"03/17/2021", "03/18/2021", "03/19/2021", "03/20/2021", "03/21/2021",
"03/22/2021", "03/23/2021", "03/24/2021", "03/25/2021", "03/26/2021",
"03/28/2021", "03/29/2021", "03/30/2021", "03/31/2021", "04/01/2021",
"04/02/2021", "04/03/2021", "04/04/2021", "04/05/2021", "04/06/2021",
"04/07/2021", "04/08/2021", "04/09/2021", "04/10/2021", "04/11/2021",
"04/12/2021", "04/13/2021", "04/14/2021", "04/15/2021", "04/16/2021",
"04/17/2021", "04/18/2021", "04/19/2021", "04/20/2021", "04/21/2021",
"04/22/2021", "04/23/2021", "04/24/2021", "04/26/2021", "04/27/2021",
"04/28/2021", "04/29/2021", "04/30/2021", "05/01/2021", "05/02/2021",
"05/03/2021", "05/04/2021", "05/05/2021", "05/06/2021", "05/07/2021",
"05/08/2021", "05/10/2021", "05/11/2021", "05/12/2021", "05/13/2021",
"05/14/2021", "05/15/2021", "05/18/2021", "05/19/2021", "05/20/2021",
"05/21/2021", "05/22/2021", "05/23/2021", "05/25/2021", "05/26/2021",
"05/27/2021", "05/29/2021", "05/31/2021", "06/01/2021", "06/02/2021",
"06/03/2021", "06/04/2021", "06/05/2021", "06/06/2021", "06/07/2021",
"06/08/2021", "06/09/2021", "06/10/2021", "06/11/2021", "06/12/2021",
"06/13/2021", "06/14/2021", "06/15/2021", "06/16/2021", "06/17/2021"
), `length(category)` = c(4L, 8L, 4L, 4L, 4L, 12L, 8L, 8L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 8L, 9L, 25L, 10L, 5L, 10L,
5L, 20L, 5L, 5L, 19L, 10L, 20L, 5L, 5L, 5L, 2L, 12L, 22L, 14L,
2L, 4L, 6L, 9L, 5L, 37L, 5L, 4L, 10L, 12L, 21L, 4L, 4L, 6L, 6L,
9L, 16L, 9L, 6L, 13L, 2L, 19L, 24L, 20L, 8L, 25L, 16L, 5L, 24L,
27L, 24L, 6L, 9L, 15L, 8L, 25L, 35L, 26L, 8L, 2L, 5L, 14L, 12L,
20L, 19L, 7L, 9L, 5L, 4L, 17L, 13L, 15L, 8L, 10L, 7L, 5L, 16L,
11L, 16L, 10L, 6L, 3L, 22L, 10L, 10L, 3L, 1L, 7L, 4L, 6L, 4L,
5L, 3L, 6L, 3L, 3L, 1L, 3L, 9L, 6L, 4L, 2L, 1L, 1L, 3L, 5L, 2L,
4L, 2L, 2L, 1L, 5L, 5L, 6L, 3L), `sum(usd_pledged)` = c(50278.64,
366279.590415302, 172073.0471292, 230.537553792, 304353.5676352,
285277.861423738, 931521.92, 62322.104033708, 292739.37663744,
97895.025306156, 20538.4758468, 5716, 69712, 15248.7519561728,
1257502.99126928, 102268, 32151.762183708, 70322.17520884, 27549.00453216,
8371657.96195552, 1420782.4068818, 137818.171860595, 1175436.75496273,
1770825.83285245, 1070, 178761.8127624, 10016.918409372, 146129.2,
610608.234267955, 676175.367448825, 1274147.86429578, 45595.6660422,
1776940, 4198.748196, 17.02446963, 55272.18380506, 490750.12364435,
571800.5, 227.26420984, 619.23335154, 5942.151506976, 1098.48977709,
87300, 1879564.14902818, 57100, 3.628861728, 715993, 228885.860968739,
1005412.33040269, 32978.39955816, 1956.52590528, 5789.84841572,
508.6539266268, 192238.643979976, 278988.70418106, 32470.60072344,
10653.5543364, 333900.00289616, 2467.35065664, 698082.326436438,
581461.49769354, 170032.27513805, 24516.65281874, 11530.3738156855,
20060.6125168384, 154030.2061095, 215919.1704622, 1320696.42919177,
2000470.11990896, 1414.108082664, 12429.5108974052, 80676.567104964,
3211.42501648, 69994.39317561, 827188.797715076, 51349.427891072,
47925.216359587, 42, 28391.25545206, 6199141.75469484, 16078.9170341546,
724307.969231123, 238317.166592813, 243887.2812338, 3856.910710253,
5982.1359855268, 1986.4520326, 180186.734055936, 25818.301703542,
284175.24917946, 27486.5134227176, 51837.3569258, 218101.634171675,
543.2343820104, 299634.97422679, 9200.1639420603, 112660.244016819,
27675.2965010449, 5203.03118806, 75650.25, 327968.549088524,
1002.27102748, 37881.2795828444, 2896.1824198269, 596, 206880.064933601,
1688.14367074, 19176.1722666628, 48811.21823157, 4572.83962465,
375.36475346, 126743.686979946, 489.0998, 4042.91231235, 1343.0098244,
2286.711086, 77485.7681307, 25272.4433349128, 11630.0007860946,
377.22065991, 199.9004139, 161, 2667.93172391, 69404.71338983,
23796.67811208, 17174.64072667, 9950.0416377665, 35443.95356773,
184.603426004, 85934.0997877056, 5603.88739935, 2306.02823117,
13.0064795336), `sum(backers_count)` = c(2880L, 6588L, 3528L,
16L, 4204L, 6632L, 15404L, 1672L, 3588L, 416L, 464L, 364L, 1488L,
228L, 17124L, 2284L, 1348L, 2324L, 744L, 185822L, 35095L, 5980L,
24615L, 32525L, 40L, 4650L, 450L, 2640L, 15952L, 9275L, 16404L,
990L, 35075L, 79L, 6L, 1578L, 10705L, 9302L, 12L, 20L, 36L, 91L,
1505L, 40509L, 1890L, 4L, 9684L, 4196L, 16477L, 754L, 62L, 54L,
36L, 3787L, 3809L, 996L, 132L, 13216L, 50L, 12073L, 13826L, 4353L,
220L, 393L, 628L, 3607L, 6712L, 22403L, 30468L, 106L, 136L, 1854L,
102L, 1623L, 14638L, 1396L, 1923L, 6L, 426L, 161556L, 1020L,
6922L, 3575L, 1627L, 84L, 101L, 56L, 2291L, 450L, 2439L, 678L,
864L, 3195L, 17L, 6072L, 215L, 3119L, 1473L, 85L, 2306L, 7622L,
41L, 973L, 117L, 20L, 3127L, 45L, 424L, 819L, 84L, 12L, 3239L,
18L, 66L, 7L, 35L, 1651L, 651L, 242L, 25L, 13L, 6L, 59L, 1747L,
325L, 244L, 231L, 862L, 6L, 1819L, 74L, 66L, 4L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -133L))
My final objective is to get a dataset, where i can get for each date the data, including for the days where there is missing data in one of the data sets
CodePudding user response:
I assume your key problem is changing eg 01/15/2022
to a date. You can easily use the package lubridate
s mdy()
function for this (install.packages('lubridate')
.
*adjusted from your comment.
You can run:
library(dplyr)
library(lubridate)
group_df %>%
mutate(date = mdy(date)) %>%
full_join(
group_tweet
) %>%
arrange(date)
Output is:
Joining, by = "date"
# A tibble: 181 × 5
date `length(category)` `sum(usd_pledged)` `sum(backers_count)` `length(text)`
<date> <int> <dbl> <int> <int>
1 2021-01-01 4 50279. 2880 1324
2 2021-01-02 NA NA NA 1548
3 2021-01-03 NA NA NA 1297
4 2021-01-04 NA NA NA 1585
5 2021-01-05 8 366280. 6588 1636
6 2021-01-06 4 172073. 3528 1583
7 2021-01-07 4 231. 16 1492
8 2021-01-08 NA NA NA 1676
9 2021-01-09 NA NA NA 1745
10 2021-01-10 NA NA NA 1389
# … with 171 more rows