I have a data set from which I need to extract another data set. I am interested in married women and their living conditions, and would like to create a data set for married women only, that would, for each woman, contain the following information:
- Her age;
- Her husband’s age;
- Her level of education;
- Her husband’s level of education;
- Number of children (regardless of the age);
- Number of children under the age of seven;
- Whether or not she lives with her mother (or her mother lives with her);
- Whether or not she lives with her father (or vice versa);
- Whether or not she lives with her father-in-law (or vice versa);
- Whether or not she lives with her mother-in-law (or vice versa).
I wrote the code below, but I would like someone to help me replace it with a faster and more elegant code using dplyr
/tidyverse
:
library(tidyverse)
library(dplyr)
library(tidyr)
library(haven)
data <- read_sav("Individus ENCDM 2014 en format SAV.sav")
married_women <- data %>% filter(Sexe==2 & Etat_matrimonial==2 & Liendeparenté==2)
child <- c() # children
c_six <- c() # children below six
h_age <- c() # husband's age
h_edu <- c() # husband's education
w_mum <- c() # wife's mum
w_dad <- c() # wife's dad
h_mum <- c() # husband's mum
h_dad <- c() # husband's dad
for (i in married_women$N_ménage) {
a <- nrow(individual[individual$N_ménage==i & individual$Liendeparenté==3,])
child <- append(child, a)
b <- individual[individual$N_ménage==i & individual$Liendeparenté==1,]$Age
h_age <- append(h_age, b)
c <- individual[individual$N_ménage==i & individual$Liendeparenté==1,]$Niveau_scolaire_agreg
h_edu <- append(h_edu, c)
d <- nrow(individual[individual$N_ménage==i & individual$Liendeparenté==7 & individual$Sexe==2,])
w_mum <- append(w_mum, d)
e <- nrow(individual[individual$N_ménage==i & individual$Liendeparenté==6 & individual$Sexe==2,])
w_dad <- append(w_dad, e)
f <- nrow(individual[individual$N_ménage==i & individual$Liendeparenté==7 & individual$Sexe==1,])
h_mum <- append(h_mum, f)
g <- nrow(individual[individual$N_ménage==i & individual$Liendeparenté==6 & individual$Sexe==1,])
h_dad <- append(h_dad, g)
h <- nrow(individual[individual$N_ménage==i & individual$Liendeparenté==3 & individual$Age<=6,])
c_six <- append(c_six, h)
}
married_women$Number_children <- child
married_women$N_chl_below_six <- c_six
married_women$Husband_age <- h_age
married_women$Husband_educati <- h_edu
married_women$Mother_of_wife <- w_mum
married_women$Mother_of_husb <- h_mum
married_women$Father_of_wife <- w_dad
married_women$Father_of_husb <- h_dad
and here is a small chunk of the data set I am using:
> dput(d_head)
structure(list(N_ménage = structure(c(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), label = "Numéro du ménage", format.spss = "F8.2"),
Nordre = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), label = "Numéro d'ordre de l'individu", format.spss = "F2.0", display_width = 4L),
Coef_individu = structure(c(168, 178, 183, 178, 157, 183,
178, 121, 157, 160, 168, 178, 183, 178, 157, 183, 178, 121,
157, 160), label = "Coefficient individu", format.spss = "F8.2", display_width = 4L),
Milieu = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1), label = "Milieu de résidence", format.spss = "F1.0", display_width = 4L, labels = c(Urbain = 1,
Rural = 2), class = c("haven_labelled", "vctrs_vctr", "double"
)), Région_12 = structure(c(12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), label = "Régions", format.spss = "F2.0", display_width = 11L, labels = c(`Tanger-Tétouan-Al Hoceïma` = 1,
Oriental = 2, `Fès-Meknès` = 3, `Rabat-Salé-Kénitra` = 4,
`Béni Mellal-Khénifra` = 5, `Casablanca-Settat` = 6, `Marrakech-Safi` = 7,
`Drâa-Tafilalet` = 8, `Souss-Massa` = 9, `Guelmim-Oued Noun` = 10,
`Laâyoune-Sakia El Hamra` = 11, `Dakhla-Oued Ed Dahab` = 12
), class = c("haven_labelled", "vctrs_vctr", "double")),
Sexe = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2), label = "Sexe", format.spss = "F1.0", display_width = 4L, labels = c(Masculin = 1,
Féminin = 2), class = c("haven_labelled", "vctrs_vctr",
"double")), Age = structure(c(17, 31, 27, 30, 41, 28, 32,
50, 40, 39, 17, 31, 27, 30, 41, 28, 32, 50, 40, 39), label = "Age", format.spss = "F2.0", display_width = 4L),
Age_quin = structure(c(4, 7, 6, 7, 9, 6, 7, 11, 9, 8, 4,
7, 6, 7, 9, 6, 7, 11, 9, 8), label = "Age quinquenal de l'individu", format.spss = "F8.2", display_width = 10L, labels = c(`0-4` = 1,
`5-9` = 2, `10-14` = 3, `15-19` = 4, `20-24` = 5, `25-29` = 6,
`30-34` = 7, `35-39` = 8, `40-45` = 9, `45-49` = 10, `50-54` = 11,
`55-59` = 12, `60-64` = 13, `65-69` = 14, `70-74` = 15, `75 et plus` = 16
), class = c("haven_labelled", "vctrs_vctr", "double")),
Liendeparenté = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Lien de parenté avec le chef de ménage", format.spss = "F2.0", display_width = 4L, labels = c(`Chef de ménage` = 1,
`Epoux (se) du chef de ménage` = 2, `Fils (fille) du chef de ménage` = 3,
`Epouse du fils /Epoux de la fille du chef de ménage` = 4,
`Petit fils (petite fille) du chef de ménage` = 5, `Père ou mère du chef de ménage` = 6,
`Père ou mère de l’époux (épouse) du chef de ménage` = 7,
`Frère ou sœur du chef de ménage` = 8, `Autres personne en lien de parenté avec le chef de ménage` = 9,
`Autres personne sans lien de parenté avec le chef de ménage` = 10,
`Personne travaillant pour le compte du ménage: aude ménagère, chauffeur, jardinier...` = 11
), class = c("haven_labelled", "vctrs_vctr", "double")),
Etat_matrimonial = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Etat matrimonial", format.spss = "F1.0", display_width = 15L, labels = c(Célibataire = 1,
`Marié(e)` = 2, `Divorcé(e)` = 3, `Veuf(ve)` = 4), class = c("haven_labelled",
"vctrs_vctr", "double")), Niveau_scolaire_agreg = structure(c(2,
3, 2, 5, 5, 4, 2, 4, 0, 4, 2, 3, 2, 5, 5, 3, 2, 4, 0, 4), label = "Le plus haut niveau scolaire", format.spss = "F8.2", display_width = 7L, labels = c(`Sans niveau` = 0,
Préscolaire = 1, Primaire = 2, `Secondaire collégial` = 3,
`Secondaire qualifiant` = 4, Supérieur = 5, `Autre niveau` = 6
), class = c("haven_labelled", "vctrs_vctr", "double")),
Diplôme_agregé = structure(c(1, 2, 1, 3, 3, 2, 1, 2, 1,
2, 1, 2, 1, 3, 3, 2, 1, 2, 1, 2), label = "Le plus haut diplôme", format.spss = "F4.0", display_width = 10L, labels = c(`Age moins de 10 ans` = 0,
`Sans diplôme` = 1, `Niveau moyen` = 2, `Niveau supérieur` = 3
), class = c("haven_labelled", "vctrs_vctr", "double")),
Type_activité_dominante = structure(c(4, 4, 4, 3, 1, 4,
4, 1, 4, 4, 4, 4, 4, 3, 1, 4, 4, 1, 4, 4), label = "Type d'activité habituelle durant les 12 dérniers mois", format.spss = "F2.0", display_width = 9L, labels = c(`Actif occupé` = 1,
`Chômeur ayant déjà travaillé` = 2, `Chômeur n' ayant jamais travaillé` = 3,
`Femme au foyer` = 4, `Elève, étudiant` = 5, Enfant = 6,
`Personne âgée` = 7, Retraité = 8, Rentier = 9, `Malade/infirme` = 10,
`Autre inactif` = 11, `Non déclaré` = 99), class = c("haven_labelled",
"vctrs_vctr", "double")), Profession_agreg = structure(c(NA,
NA, NA, 0, 2, NA, NA, 2, NA, NA, NA, NA, NA, 0, 2, NA, NA,
2, NA, NA), label = "Catégories professionnelles dans l'activité habituelle durant les 12 dérniers mois", format.spss = "F1.0", display_width = 16L, labels = c(`Chômeur n'ayant jamais travaillés` = 0,
`directeurs et cadres de direction, membres des corps législatifs,cadres supérieurs et membres des professions libérales` = 1,
`Cadres moyens et employés de bureau` = 2, `Commerçants, intermédiaires commerciaux et financiers` = 3,
`Exploitants et ouvriers agricoles (y compris pêche, chasse et forêt)` = 4,
`Artisans et ouvriers qualifies et conducteurs d'installations et de machines et ouvriers de l'assemblage` = 5,
`Manœuvres non agricoles, manutentionnaires et travailleurs des petits métiers` = 6,
`Non déclaré` = 9), class = c("haven_labelled", "vctrs_vctr",
"double")), Secteur_activité_agreg = structure(c(NA, NA,
NA, 0, 5, NA, NA, 5, NA, NA, NA, NA, NA, 0, 5, NA, NA, 5,
NA, NA), label = "Secteur d'activité habituelle durant les 12 dérniers mois", format.spss = "F1.0", display_width = 14L, labels = c(`Chômeur n'ayant jamais travaillé` = 0,
`Agriculture, forêt et pêhe` = 1, Industrie = 2, `Batiment et travaux publics` = 3,
Commerce = 4, Service = 5, `Non déclaré` = 9), class = c("haven_labelled",
"vctrs_vctr", "double")), Situation_profession_agreg = structure(c(1,
1, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1, 1, 0, 2, 1, 1, 2, 1, 1), label = "Situation dans la profession habituelle durant les 12 dérniers mois", format.spss = "F1.0", labels = c(`Chômeur n'ayant jamais travaillé` = 0,
Inactif = 1, Salarié = 2, Indépendant = 3, Employeur = 4,
`Autres situation` = 5, `Non déclaré` = 9), class = c("haven_labelled",
"vctrs_vctr", "double")), N_Children = c(0L, 1L, 1L, 1L,
2L, 2L, 2L, 4L, 6L, 5L, 0L, 1L, 1L, 1L, 2L, 2L, 2L, 4L, 6L,
5L), Husband_age = c(26, 36, 38, 34, 42, 33, 40, 50, 49,
53, 26, 36, 38, 34, 42, 33, 40, 50, 49, 53), Husband_edu = structure(c(2,
4, 2, 5, 5, 4, 3, 4, 4, 0, 2, 4, 2, 5, 5, 4, 3, 4, 4, 0), label = "Le plus haut niveau scolaire", format.spss = "F8.2", display_width = 7L, labels = c(`Sans niveau` = 0,
Préscolaire = 1, Primaire = 2, `Secondaire collégial` = 3,
`Secondaire qualifiant` = 4, Supérieur = 5, `Autre niveau` = 6
), class = c("haven_labelled", "vctrs_vctr", "double")),
wife_mother = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Wife_mother = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L), Husband_mother = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Wife_father = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Husband_father = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), Act = c(0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
0, 0, 1, 1, 0, 0, 1, 0, 0), Below_six = c(0L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 3L, 0L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
3L), Activity = c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Can someone please help me rewrite the code in a more concise manner. Thank you very much in advance.
PS: this is dput
for the raw data (before modification):
> dput(d_head)
structure(list(N_ménage = structure(c(1, 1, 2, 2, 2, 3, 3, 3,
4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6), label = "Numéro du ménage", format.spss = "F8.2"),
Nordre = structure(c(2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 1, 4,
3, 2, 1, 4, 3, 5, 2, 1), label = "Numéro d'ordre de l'individu", format.spss = "F2.0", display_width = 4L),
Coef_individu = structure(c(168, 392, 191, 178, 307, 187,
183, 307, 191, 178, 427, 164, 162, 157, 306, 191, 164, 118,
183, 427), label = "Coefficient individu", format.spss = "F8.2", display_width = 4L),
Milieu = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1), label = "Milieu de résidence", format.spss = "F1.0", display_width = 4L, labels = c(Urbain = 1,
Rural = 2), class = c("haven_labelled", "vctrs_vctr", "double"
)), Région_12 = structure(c(12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), label = "Régions", format.spss = "F2.0", display_width = 11L, labels = c(`Tanger-Tétouan-Al Hoceïma` = 1,
Oriental = 2, `Fès-Meknès` = 3, `Rabat-Salé-Kénitra` = 4,
`Béni Mellal-Khénifra` = 5, `Casablanca-Settat` = 6, `Marrakech-Safi` = 7,
`Drâa-Tafilalet` = 8, `Souss-Massa` = 9, `Guelmim-Oued Noun` = 10,
`Laâyoune-Sakia El Hamra` = 11, `Dakhla-Oued Ed Dahab` = 12
), class = c("haven_labelled", "vctrs_vctr", "double")),
Sexe = structure(c(2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2,
2, 1, 2, 1, 2, 2, 1), label = "Sexe", format.spss = "F1.0", display_width = 4L, labels = c(Masculin = 1,
Féminin = 2), class = c("haven_labelled", "vctrs_vctr",
"double")), Age = structure(c(17, 26, 1, 31, 36, 0, 27, 38,
2, 30, 34, 6, 11, 41, 42, 2, 9, 47, 28, 33), label = "Age", format.spss = "F2.0", display_width = 4L),
Age_quin = structure(c(4, 6, 1, 7, 8, 1, 6, 8, 1, 7, 7, 2,
3, 9, 9, 1, 2, 10, 6, 7), label = "Age quinquenal de l'individu", format.spss = "F8.2", display_width = 10L, labels = c(`0-4` = 1,
`5-9` = 2, `10-14` = 3, `15-19` = 4, `20-24` = 5, `25-29` = 6,
`30-34` = 7, `35-39` = 8, `40-45` = 9, `45-49` = 10, `50-54` = 11,
`55-59` = 12, `60-64` = 13, `65-69` = 14, `70-74` = 15, `75 et plus` = 16
), class = c("haven_labelled", "vctrs_vctr", "double")),
Liendeparenté = structure(c(2, 1, 3, 2, 1, 3, 2, 1, 3, 2,
1, 3, 3, 2, 1, 3, 3, 7, 2, 1), label = "Lien de parenté avec le chef de ménage", format.spss = "F2.0", display_width = 4L, labels = c(`Chef de ménage` = 1,
`Epoux (se) du chef de ménage` = 2, `Fils (fille) du chef de ménage` = 3,
`Epouse du fils /Epoux de la fille du chef de ménage` = 4,
`Petit fils (petite fille) du chef de ménage` = 5, `Père ou mère du chef de ménage` = 6,
`Père ou mère de l’époux (épouse) du chef de ménage` = 7,
`Frère ou sœur du chef de ménage` = 8, `Autres personne en lien de parenté avec le chef de ménage` = 9,
`Autres personne sans lien de parenté avec le chef de ménage` = 10,
`Personne travaillant pour le compte du ménage: aude ménagère, chauffeur, jardinier...` = 11
), class = c("haven_labelled", "vctrs_vctr", "double")),
Etat_matrimonial = structure(c(2, 2, 1, 2, 2, 1, 2, 2, 1,
2, 2, 1, 1, 2, 2, 1, 1, 3, 2, 2), label = "Etat matrimonial", format.spss = "F1.0", display_width = 15L, labels = c(Célibataire = 1,
`Marié(e)` = 2, `Divorcé(e)` = 3, `Veuf(ve)` = 4), class = c("haven_labelled",
"vctrs_vctr", "double")), Niveau_scolaire_agreg = structure(c(2,
2, NA, 3, 4, NA, 2, 2, NA, 5, 5, 2, 2, 5, 5, NA, 2, 2, 4,
4), label = "Le plus haut niveau scolaire", format.spss = "F8.2", display_width = 7L, labels = c(`Sans niveau` = 0,
Préscolaire = 1, Primaire = 2, `Secondaire collégial` = 3,
`Secondaire qualifiant` = 4, Supérieur = 5, `Autre niveau` = 6
), class = c("haven_labelled", "vctrs_vctr", "double")),
Diplôme_agregé = structure(c(1, 2, 0, 2, 2, 0, 1, 1, 0,
3, 3, 0, 1, 3, 3, 0, 0, 1, 2, 2), label = "Le plus haut diplôme", format.spss = "F4.0", display_width = 10L, labels = c(`Age moins de 10 ans` = 0,
`Sans diplôme` = 1, `Niveau moyen` = 2, `Niveau supérieur` = 3
), class = c("haven_labelled", "vctrs_vctr", "double")),
Type_activité_dominante = structure(c(4, 1, NA, 4, 1, NA,
4, 1, NA, 3, 1, NA, 5, 1, 1, NA, 5, 1, 4, 1), label = "Type d'activité habituelle durant les 12 dérniers mois", format.spss = "F2.0", display_width = 9L, labels = c(`Actif occupé` = 1,
`Chômeur ayant déjà travaillé` = 2, `Chômeur n' ayant jamais travaillé` = 3,
`Femme au foyer` = 4, `Elève, étudiant` = 5, Enfant = 6,
`Personne âgée` = 7, Retraité = 8, Rentier = 9, `Malade/infirme` = 10,
`Autre inactif` = 11, `Non déclaré` = 99), class = c("haven_labelled",
"vctrs_vctr", "double")), Profession_agreg = structure(c(NA,
6, NA, NA, 4, NA, NA, 5, NA, 0, 2, NA, NA, 2, 4, NA, NA,
4, NA, 4), label = "Catégories professionnelles dans l'activité habituelle durant les 12 dérniers mois", format.spss = "F1.0", display_width = 16L, labels = c(`Chômeur n'ayant jamais travaillés` = 0,
`directeurs et cadres de direction, membres des corps législatifs,cadres supérieurs et membres des professions libérales` = 1,
`Cadres moyens et employés de bureau` = 2, `Commerçants, intermédiaires commerciaux et financiers` = 3,
`Exploitants et ouvriers agricoles (y compris pêche, chasse et forêt)` = 4,
`Artisans et ouvriers qualifies et conducteurs d'installations et de machines et ouvriers de l'assemblage` = 5,
`Manœuvres non agricoles, manutentionnaires et travailleurs des petits métiers` = 6,
`Non déclaré` = 9), class = c("haven_labelled", "vctrs_vctr",
"double")), Secteur_activité_agreg = structure(c(NA, 4,
NA, NA, 1, NA, NA, 3, NA, 0, 5, NA, NA, 5, 2, NA, NA, 2,
NA, 2), label = "Secteur d'activité habituelle durant les 12 dérniers mois", format.spss = "F1.0", display_width = 14L, labels = c(`Chômeur n'ayant jamais travaillé` = 0,
`Agriculture, forêt et pêhe` = 1, Industrie = 2, `Batiment et travaux publics` = 3,
Commerce = 4, Service = 5, `Non déclaré` = 9), class = c("haven_labelled",
"vctrs_vctr", "double")), Situation_profession_agreg = structure(c(1,
3, NA, 1, 2, NA, 1, 3, NA, 0, 2, NA, 1, 2, 2, NA, 1, 2, 1,
2), label = "Situation dans la profession habituelle durant les 12 dérniers mois", format.spss = "F1.0", labels = c(`Chômeur n'ayant jamais travaillé` = 0,
Inactif = 1, Salarié = 2, Indépendant = 3, Employeur = 4,
`Autres situation` = 5, `Non déclaré` = 9), class = c("haven_labelled",
"vctrs_vctr", "double"))), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
CodePudding user response:
Here's an approach where I summarize individuals
as a few lookup tables, each with various columns but only one row per N_ménage
, which can then be joined to the married_women
table using that as a key. Joins should be much faster than a looped lookup, especially as the data gets larger.
(As you tweak your code, check and make sure the lookups only ever have one row per N_ménage
, e.g. by running count(child_count, N_ménage) %>% count(n)
. That will count how many times each N_ménage appears (n), and then count those counts (nn). We should see only one n
value, and then a number nn
corresponding to the number of N_ménage where this facet of data is available. I wonder if you ever would face the situation of, for example, multiple husbands over the survey period; perhaps you would then want to take the average / highest / lowest education level...)
For the sample data there are many NAs, but I hope the general logic would apply to your larger data, and you'd presumably want to replace all the NA's with zeroes.
library(tidyverse)
child_count <- individuals %>% filter(Liendeparenté == 3) %>%
group_by(N_ménage) %>%
summarize(child_count = n(),
child_under6 = sum(Age <= 6))
husbands <- individuals %>% filter(Liendeparenté == 1) %>%
transmute(N_ménage, H_age = Age, H_edu = Niveau_scolaire_agreg)
parents_with <- individuals %>% filter(Liendeparenté %in% 6:7) %>%
count(N_ménage, type = case_when(
Liendeparenté == 7 & Sexe == 2 ~ "w_mum",
Liendeparenté == 6 & Sexe == 2 ~ "w_dad",
Liendeparenté == 7 & Sexe == 1 ~ "h_mum",
Liendeparenté == 6 & Sexe == 1 ~ "h_dad")) %>%
pivot_wider(names_from = type, values_from = n)
married_women %>%
select(N_ménage, Age, Niveau_scolaire_agreg) %>%
left_join(child_count) %>%
left_join(husbands) %>%
left_join(parents_with)