Home > Net >  principal component analysis (PCA) on panel data in R using prcomp
principal component analysis (PCA) on panel data in R using prcomp

Time:03-09

I'm working with a large cross-country panel dataset. I would like to apply Principal Component Analysis for each country in my sample. As I understand it, the prcomp function does not work on panel data frames directly. I could create a subset for each country and then use the prcomp function as follows:

# Keep a single country's rows and drop every observation that has a
# missing value in any column.
df <- df %>%
  filter(country == "Argentina") %>%
  na.omit()
# Run the PCA on the indicator columns only. The identifier columns are
# removed by name (any_of() ignores names that are absent) instead of by
# position, so a character `country` column is never handed to prcomp().
# `scale.` is spelled out in full; `scale = TRUE` only worked through
# partial argument matching.
PCA <- prcomp(select(df, -any_of(c("year", "country"))), scale. = TRUE)
# Attach the scores of the first principal component to the data.
df2 <- cbind(df, PC1 = PCA$x[, 1])

The problem with this approach is that I'm working with a large panel of countries. I would like to find an efficient way of applying PCA to each country (using loops, dplyr, apply, or something else). Any lead would be appreciated! Here is a snapshot of my data:

structure(list(year = c(1993, 1994, 1995, 1996, 1997, 1998, 1999, 
2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 
2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 1989, 1990, 
1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 
2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 
2013, 2014, 2015, 2016, 2017, 2018, 2019), C1 = c(5.13682648211779, 
5.08266800031075, 5.351464616273, 5.61086611323361, 5.22197210944516, 
5.25272256987622, 4.45782610229152, 4.67991094417297, 4.56858960891786, 
10.2066467144411, 10.328166939117, 8.36248661002826, 7.93875427420938, 
6.93557229126392, 7.45235394606923, 7.3189545972436, 5.2736234689014, 
7.13216745078964, 6.99873377022519, 5.7817442068501, 6.05291843670451, 
6.71270351428559, 5.1566859021408, 6.26456582010254, 5.23162237725058, 
4.53787889681146, 5.11101651115706, 16.5859815004054, 17.4762288321229, 
15.2470755823245, 14.9836651520805, 15.4160185614868, 15.4628347718716, 
15.6932030829778, 16.8545560944976, 14.4611526932527, 12.4503166318037, 
10.4578259441456, 10.0328526543258, 8.78803796915921, 10.1907585574088, 
9.79514977798378, 9.32742275784367, 9.16534055228687, 8.09488128822859, 
7.45702743708227, 7.42466846883692, 8.07198879989725, 8.9669281343332, 
8.17395626522538, 7.69487759202317, 6.67610812297851, 6.55723547339657, 
6.86837534505832, 6.14229206403875, 6.03862979568919, 5.77601222087928, 
6.40077492846908), C2 = c(18.2399115325496, 17.8190917106899, 
17.2467521148076, 17.5357232920479, 18.227905749866, 17.8379584760908, 
16.9615250614589, 16.4942719439838, 16.0932258763829, 20.347773913878, 
22.4867505875749, 18.9370136214371, 18.340415936715, 17.8777938558849, 
17.0474154518997, 16.5383660984547, 15.5837772738051, 15.8448608064195, 
15.8506983942663, 15.2168009115954, 15.0110542553443, 14.7727785491821, 
14.1815854549835, 13.4880259708051, 12.8351683415363, 14.0601434041581, 
14.469857210653, 23.1218538833796, 21.9578069720618, 22.1719176235394, 
21.6370883362235, 20.8090845243861, 22.0629117902148, 22.5660651401301, 
21.1414222999772, 21.5655846462854, 22.2716554368656, 19.9963040675879, 
18.7056269713129, 17.7146298488846, 16.9326153021282, 17.1114469831801, 
16.9482350609586, 16.903863698293, 17.0593739972631, 16.8046678617533, 
16.2646010673369, 15.1576059052686, 15.0539685698439, 16.4486710311014, 
15.8337489004359, 16.2798557387916, 16.7716970232894, 16.6961854458277, 
16.5954578952398, 17.5918310459266, 19.0433777374516, 18.280976425027
), C3 = c(82.5125543268366, 88.4834748372495, 64.328775268034, 
67.295938371345, 77.7236906437735, 81.2531966123609, 73.903671516043, 
63.2796145278544, 49.6590053859052, 24.2929772699524, 32.0464542409784, 
45.2763649379163, 49.6805395163102, 50.4269078500422, 54.2479902089021, 
53.48675458569, 45.3830010270168, 53.7777258133156, 54.2717733344258, 
51.8017865791584, 56.0976840936343, 57.6821855247763, 68.0551439256991, 
60.7549295505373, 70.3450187781642, 61.7863847377178, 47.0046920548502, 
80.7115299677774, 98.9037032210514, 97.3968931160828, 98.6684502902615, 
125.766186334077, 82.7073657385639, 109.094641464308, 116.082201273435, 
117.630112854441, 113.089104828361, 97.6533515496685, 118.083087499274, 
79.487572866742, 87.5239813988531, 87.5860747310278, 94.906199990454, 
94.7033145531204, 94.5718715539841, 91.6814234437438, 81.2978732632131, 
74.034860714713, 87.6101498446379, 94.5147405516283, 80.1097244213394, 
91.3450549864493, 82.8358949845043, 81.6674625026796, 85.0041947243795, 
89.2649309430369, 75.407517473589, 65.7686639087196)), row.names = c(NA, 
-58L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`1` = 1L, 
`2` = 2L, `3` = 3L, `4` = 4L, `5` = 5L, `6` = 6L, `7` = 7L, `8` = 8L, 
`9` = 9L, `10` = 10L, `11` = 11L, `12` = 12L, `13` = 13L, `14` = 14L, 
`15` = 15L, `16` = 16L, `17` = 17L, `18` = 18L, `19` = 19L, `20` = 20L, 
`21` = 21L, `22` = 22L, `23` = 23L, `24` = 24L, `25` = 25L, `26` = 26L, 
`27` = 27L, `28` = 28L, `29` = 29L, `30` = 30L, `31` = 31L, `32` = 32L, 
`33` = 33L, `34` = 34L, `35` = 35L, `36` = 36L, `37` = 37L, `38` = 38L, 
`39` = 39L, `40` = 40L, `41` = 41L, `42` = 42L, `43` = 43L, `71` = 71L, 
`72` = 72L, `73` = 73L, `74` = 74L, `75` = 75L, `76` = 76L, `77` = 77L, 
`78` = 78L, `79` = 79L, `80` = 80L, `81` = 81L, `82` = 82L, `83` = 83L, 
`84` = 84L, `85` = 85L, `86` = 86L, `87` = 87L, `88` = 88L, `89` = 89L, 
`90` = 90L, `91` = 91L, `92` = 92L, `93` = 93L, `94` = 94L, `95` = 95L, 
`96` = 96L, `97` = 97L, `98` = 98L, `99` = 99L, `100` = 100L, 
`101` = 101L, `102` = 102L, `103` = 103L, `104` = 104L, `105` = 105L, 
`106` = 106L, `107` = 107L, `108` = 108L, `109` = 109L, `110` = 110L, 
`142` = 142L), class = "omit"))
# Build the two-country example data and rename the indicators to C1-C3.
df <- datasetALL %>%
  filter(country %in% c("Argentina", "Turkey")) %>%
  # The original condition `year >= 2008 | year < 2010` holds for every
  # non-missing year, so its only effect was to drop rows without a year.
  filter(!is.na(year)) %>%
  # Keep `country`: the dput() output of this data contains it, and any
  # per-country analysis needs it as the grouping key.
  select(year, country, C1 = agr_GDP, C2 = manu_GDP, C3 = intcapimp_X)
dput(df)
structure(list(year = c(1950, 1951, 1952, 1953, 1954, 1955, 1956, 
1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 
1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 
1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 
1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 
2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 1950, 1951, 
1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 
1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 
1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 
1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 
1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 
2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 
2018, 2019, 2020), country = c("Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina", 
"Argentina", "Argentina", "Argentina", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey"), C1 = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 12.9041629886915, 
10.2787574725073, 9.98038631665412, 9.72127698537181, 9.18580964838541, 
9.6375609641048, 10.8658823843399, 10.984869871837, 11.9525224881086, 
10.2308377488651, 6.5839104917193, 8.15217071220236, 8.08714211405919, 
7.50442747786782, 7.79627735903041, 6.35350500480657, 6.47552296064522, 
9.59615604621897, 8.65832496118367, 8.3459768371011, 7.63430725730443, 
7.80040264019792, 8.09435846444628, 8.97786821775225, 9.61606509643166, 
8.12367620787488, 6.71649176009431, 5.99078709147934, 5.13682648211779, 
5.08266800031075, 5.351464616273, 5.61086611323361, 5.22197210944516, 
5.25272256987622, 4.45782610229152, 4.67991094417297, 4.56858960891786, 
10.2066467144411, 10.328166939117, 8.36248661002826, 7.93875427420938, 
6.93557229126392, 7.45235394606923, 7.3189545972436, 5.2736234689014, 
7.13216745078964, 6.99873377022519, 5.7817442068501, 6.05291843670451, 
6.71270351428559, 5.1566859021408, 6.26456582010254, 5.23162237725058, 
4.53787889681146, 5.11101651115706, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 54.9192364170338, 51.7385257301808, 52.8019925280199, 
53.3261802575107, 50.4970178926441, 46.1467038068709, 47.2025216706068, 
44.53125, 41.5873015873016, 40.8675799086758, 39.0839694656489, 
37.360824742268, 34.3133863714977, 33.956043956044, 36.0024203307785, 
35.3707725721378, 32.519436345967, 31.4399318375462, 31.803527403755, 
27.9075627227242, 26.1461400221772, 24.1627642070624, 22.3537484988849, 
20.9150138791008, 21.2014310977356, 19.6905586186263, 19.5126009949236, 
17.8184708900603, 17.258956312464, 16.5859815004054, 17.4762288321229, 
15.2470755823245, 14.9836651520805, 15.4160185614868, 15.4628347718716, 
15.6932030829778, 16.8545560944976, 14.4611526932527, 12.4503166318037, 
10.4578259441456, 10.0328526543258, 8.78803796915921, 10.1907585574088, 
9.79514977798378, 9.32742275784367, 9.16534055228687, 8.09488128822859, 
7.45702743708227, 7.42466846883692, 8.07198879989725, 8.9669281343332, 
8.17395626522538, 7.69487759202317, 6.67610812297851, 6.55723547339657, 
6.86837534505832, 6.14229206403875, 6.03862979568919, 5.77601222087928, 
6.40077492846908, NA), C2 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, 41.184160179361, 41.0085848893582, 
37.8240272440343, 36.1962188354532, 36.6454303394205, 31.5447436578601, 
35.762858969727, 36.358073530149, 34.3339746673214, 34.0368852105037, 
38.2151082839456, 39.1167603278579, 36.5949867349342, 34.1276092101854, 
32.6851576731524, 29.4760969895125, 28.7652362921224, 31.3602562635854, 
30.6694675312814, 29.7084408031154, 29.6418473138549, 27.4195971594836, 
27.4966462800495, 27.9996758567287, 30.9493718883737, 26.7897045521809, 
24.3864490932335, 21.8591315586603, 18.2399115325496, 17.8190917106899, 
17.2467521148076, 17.5357232920479, 18.227905749866, 17.8379584760908, 
16.9615250614589, 16.4942719439838, 16.0932258763829, 20.347773913878, 
22.4867505875749, 18.9370136214371, 18.340415936715, 17.8777938558849, 
17.0474154518997, 16.5383660984547, 15.5837772738051, 15.8448608064195, 
15.8506983942663, 15.2168009115954, 15.0110542553443, 14.7727785491821, 
14.1815854549835, 13.4880259708051, 12.8351683415363, 14.0601434041581, 
14.469857210653, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
12.7753303964758, 13.2127955493741, 13.0759651307597, 12.8755364806867, 
13.3200795228628, 14.6703806870938, 14.736012608353, 14.9857954545455, 
15.7460317460317, 16.0958904109589, 16.0814249363868, 16.6185567010309, 
17.3642338291249, 17.5549450549451, 16.5994352561517, 16.568414520633, 
17.6384839650146, 17.286755656537, 17.0617611732726, 19.1605773730247, 
17.0859939586281, 19.4393114795595, 19.9996187644155, 19.1035395302679, 
18.0569256856586, 18.2547620065251, 22.2007740904828, 21.8391127634602, 
22.9555540938444, 23.1218538833796, 21.9578069720618, 22.1719176235394, 
21.6370883362235, 20.8090845243861, 22.0629117902148, 22.5660651401301, 
21.1414222999772, 21.5655846462854, 22.2716554368656, 19.9963040675879, 
18.7056269713129, 17.7146298488846, 16.9326153021282, 17.1114469831801, 
16.9482350609586, 16.903863698293, 17.0593739972631, 16.8046678617533, 
16.2646010673369, 15.1576059052686, 15.0539685698439, 16.4486710311014, 
15.8337489004359, 16.2798557387916, 16.7716970232894, 16.6961854458277, 
16.5954578952398, 17.5918310459266, 19.0433777374516, 18.280976425027, 
NA), C3 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 82.5125543268366, 
88.4834748372495, 64.328775268034, 67.295938371345, 77.7236906437735, 
81.2531966123609, 73.903671516043, 63.2796145278544, 49.6590053859052, 
24.2929772699524, 32.0464542409784, 45.2763649379163, 49.6805395163102, 
50.4269078500422, 54.2479902089021, 53.48675458569, 45.3830010270168, 
53.7777258133156, 54.2717733344258, 51.8017865791584, 56.0976840936343, 
57.6821855247763, 68.0551439256991, 60.7549295505373, 70.3450187781642, 
61.7863847377178, 47.0046920548502, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, 80.7115299677774, 98.9037032210514, 97.3968931160828, 98.6684502902615, 
125.766186334077, 82.7073657385639, 109.094641464308, 116.082201273435, 
117.630112854441, 113.089104828361, 97.6533515496685, 118.083087499274, 
79.487572866742, 87.5239813988531, 87.5860747310278, 94.906199990454, 
94.7033145531204, 94.5718715539841, 91.6814234437438, 81.2978732632131, 
74.034860714713, 87.6101498446379, 94.5147405516283, 80.1097244213394, 
91.3450549864493, 82.8358949845043, 81.6674625026796, 85.0041947243795, 
89.2649309430369, 75.407517473589, 65.7686639087196, NA)), row.names = c(NA, 
-142L), class = c("tbl_df", "tbl", "data.frame"))

CodePudding user response:

Your dataset is a bit incomplete, as it is missing the country column, so the following is closer to pseudo-code than to a tested solution:

library(dplyr)
library(purrr)
# First split the data frame into a named list with one data frame per
# country. Call the split() generic rather than the split.data.frame method
# directly, so that S3 dispatch picks the method for the class of `df`.
split_df <- split(df, df$country)
# Next, define a function that runs prcomp on one country's data frame.
# `df` must contain the identifier columns `year` and `country`; every
# other column is passed on to prcomp().
# Returns the object produced by prcomp() for that data frame.
prcomp_wrapper <- function(df) {
  df %>%
    # drop rows with a missing value in any column before the PCA
    na.omit() %>%
    # the identifier columns are not PCA inputs
    select(-c(year, country)) %>%
    # `scale.` is spelled out in full; `scale = TRUE` only worked through
    # partial argument matching
    prcomp(scale. = TRUE)
}
# Feed every per-country data frame through the wrapper using purrr::map
PCA <- split_df %>% map(prcomp_wrapper)
# The outcome is a list that holds one PCA result per country
  • Related