#' Generate sequential household case IDs.
#'
#' IDs start at 201011000001 and increase by one per household.
#'
#' @param n Number of case IDs to generate (non-negative whole number).
#' @return A character vector of length `n` holding the IDs as digit strings.
generate_case_ids <- function(n) {
  start_id <- 201011000001
  # seq_len() yields an empty sequence for n = 0, where
  # seq(start_id, start_id - 1, by = 1) would raise an error.
  ids <- start_id + seq_len(n) - 1
  # sprintf("%.0f") always prints plain digits; as.character() may switch to
  # scientific notation for values with many trailing zeros.
  sprintf("%.0f", ids)
}
10 Appendix A: Sample Data
10.1 Introduction
The sample data used in this book were generated from the Malawi Integrated Household Survey, Fifth Edition (2018-2019), downloaded from here.
The data was generated randomly using the following code:
10.2 Define functions used
10.2.1 Create case_id generation
10.2.2 Create HHID generation function
#' Generate random household identifiers (HHIDs).
#'
#' Each HHID is a string of 32 characters drawn with replacement from the
#' hexadecimal digits 0-9 and a-f.
#'
#' @param n Number of HHIDs to generate (non-negative whole number).
#' @return A character vector of length `n`.
generate_HHIDs <- function(n) {
  hex_digits <- c(0:9, letters[1:6])
  # vapply() guarantees a character vector even when n = 0, and seq_len()
  # avoids the 1:0 trap that would otherwise produce two IDs.
  vapply(
    seq_len(n),
    function(x) paste(sample(hex_digits, 32, replace = TRUE), collapse = ""),
    character(1)
  )
}
10.3 Set seed and number of households to generate
# Set seed so the randomly generated sample data is reproducible
set.seed(123)
# Set number of households to generate
households <- 100
10.4 Load Original data and extract food and unit lists
# Import Malawi IHS5 HCES consumption module data
original_data <-
  haven::read_dta(here::here("data-ignore", "IHS5", "HH_MOD_G1.dta"))

# Extract "standard" food list from the original data
food_list <-
  original_data |>
  dplyr::select(hh_g02) |>
  dplyr::distinct()

# Extract "non-standard" food lists from the original data:
# the food codes that have at least one free-text "other" description ...
other_food_list_codes <-
  original_data |>
  dplyr::distinct(hh_g02, hh_g01_oth) |>
  dplyr::filter(hh_g01_oth != "") |>
  dplyr::distinct(hh_g02) |>
  # arrange() without a column is a no-op; sort by the food code explicitly
  dplyr::arrange(hh_g02)

# ... and every (food code, "other" description) pair seen in the data
other_food_list_options <-
  original_data |>
  dplyr::distinct(hh_g02, hh_g01_oth) |>
  dplyr::filter(hh_g01_oth != "")

# Extract Food unit lists from the original data
food_unit_lists <-
  original_data |>
  dplyr::distinct(hh_g03b, hh_g03b_label, hh_g03b_oth, hh_g03c, hh_g03c_1)

# Extract the number of foods in the food list
n_foods <- length(food_list$hh_g02)
10.5 Data creation
10.5.1 Create HHIDs
# Create case_ids (one per household)
case_id <- generate_case_ids(households)
# Generate HHIDs (one per household)
hhids <- generate_HHIDs(households)
10.5.2 Create data
# Draw one random free-text "other" description for the given food code from
# the descriptions observed in the original data.
sample_other_food <- function(food_code) {
  descriptions <- dplyr::filter(other_food_list_options, hh_g02 == food_code) |>
    dplyr::pull(hh_g01_oth)
  sample(descriptions, 1)
}

# Build one row per household x food item
sample_data <- tibble::tibble(
  case_id = rep(case_id, each = n_foods),
  HHID = rep(hhids, each = n_foods),
  hh_g00_1 = 2,
  hh_g00_2 = 2,
  # Repeat the full food list once per household (adds the hh_g02 column)
  food_list |> dplyr::slice(rep(seq_len(dplyr::n()), households)),
  # Consumed yes/no flag, drawn from the answers observed in the original data
  hh_g01 = sample(
    original_data$hh_g01,
    # replace = TRUE,  (left disabled: values are drawn without replacement)
    # One value per row; n_foods replaces a hard-coded 142 so the column
    # length always matches the other columns.
    size = households * n_foods
  )
) |>
  # Add "other food items"
  dplyr::rowwise() |>
  dplyr::mutate(
    # Only the "other (specify)" food codes get a description, and only when
    # the item was reported as consumed (hh_g01 == 1). The branch order is
    # kept as-is because every right-hand side is evaluated for each row and
    # therefore advances the random number stream.
    hh_g01_oth = dplyr::case_when(
      hh_g02 == 414 & hh_g01 == 1 ~ sample_other_food(414),
      hh_g02 == 515 & hh_g01 == 1 ~ sample_other_food(515),
      hh_g02 == 117 & hh_g01 == 1 ~ sample_other_food(117),
      hh_g02 == 830 & hh_g01 == 1 ~ sample_other_food(830),
      hh_g02 == 310 & hh_g01 == 1 ~ sample_other_food(310),
      hh_g02 == 412 & hh_g01 == 1 ~ sample_other_food(412),
      hh_g02 == 610 & hh_g01 == 1 ~ sample_other_food(610),
      hh_g02 == 916 & hh_g01 == 1 ~ sample_other_food(916),
      hh_g02 == 209 & hh_g01 == 1 ~ sample_other_food(209),
      hh_g02 == 709 & hh_g01 == 1 ~ sample_other_food(709),
      hh_g02 == 818 & hh_g01 == 1 ~ sample_other_food(818),
      hh_g02 == 804 & hh_g01 == 1 ~ sample_other_food(804),
      TRUE ~ ""
    )
  ) |>
  # Quantity consumed: a random value from 1, 2, ..., 10 and 0.5, 1.5, ..., 9.5
  dplyr::mutate(hh_g03a = dplyr::case_when(
    hh_g01 == 1 ~ sample(c(1:10, 0.5:10), 1),
    TRUE ~ NA
  )) |>
  dplyr::rowwise() |>
  # Pick a random row of food_unit_lists for each consumed item.
  # NOTE(review): 214 is presumably nrow(food_unit_lists) - confirm.
  dplyr::mutate(unit_key = dplyr::case_when(
    hh_g01 == 1 ~ sample(1:214, 1),
    TRUE ~ NA
  )) |>
  # Copy the unit columns from the selected food_unit_lists row
  dplyr::mutate(
    hh_g03b = food_unit_lists$hh_g03b[unit_key],
    hh_g03b_label = food_unit_lists$hh_g03b_label[unit_key],
    hh_g03b_oth = food_unit_lists$hh_g03b_oth[unit_key],
    hh_g03c = food_unit_lists$hh_g03c[unit_key],
    hh_g03c_1 = food_unit_lists$hh_g03c_1[unit_key]
  ) |>
  # Drop the temporary lookup key and keep the survey columns
  dplyr::select(
    -unit_key,
    "case_id",
    "HHID",
    "hh_g00_1",
    "hh_g00_2",
    "hh_g01",
    "hh_g01_oth",
    "hh_g02",
    "hh_g03a",
    "hh_g03b",
    "hh_g03b_label",
    "hh_g03b_oth",
    "hh_g03c",
    "hh_g03c_1"
  )
# Add the rest of the columns: binding onto a zero-row slice of the original
# data brings in every original column (filled with NA) for the sample rows.
sample_data <- original_data |>
  dplyr::filter(is.na(case_id)) |>
  dplyr::bind_rows(sample_data)

# Attach stata column labels
for (col in names(sample_data)) {
  attr(sample_data[[col]], "label") <- attr(original_data[[col]], "label")
}

# Export sample data as stata file
haven::write_dta(
  sample_data,
  here::here("data", "sample_data", "MWI-IHSV", "HH_MOD_G1_vMAPS.dta")
)
10.5.3 Create hh_mod_a_filt.dta file
# One row per household with a randomly assigned region code (1-3),
# written straight to a stata file.
sample_data |>
  dplyr::select(case_id, HHID) |>
  dplyr::distinct() |>
  dplyr::rowwise() |>
  dplyr::mutate(region = sample(1:3, 1)) |>
  haven::write_dta(
    here::here("data", "sample_data", "MWI-IHSV", "hh_mod_a_filt_vMAPS.dta")
  )
10.5.4 Create hh_roster.dta
# Import original roster from IHS5
ihs5_roster <- haven::read_dta(here::here("data-ignore", "IHS5", "HH_MOD_B.dta"))

# Create a data frame with the case_ids and HHIDs of our sample data
sample_roster <- sample_data |> dplyr::distinct(case_id, HHID)

# Replicate each row a random number of times between 1 and 10 to simulate
# household members
n <- sample(1:10, nrow(sample_roster), replace = TRUE)
sample_roster <- sample_roster[rep(seq_len(nrow(sample_roster)), times = n), ]

# Create other variables, drawing values from the original roster
sample_roster <- sample_roster |>
  dplyr::rowwise() |>
  dplyr::mutate(
    hh_b03 = sample(ihs5_roster$hh_b03, 1),
    hh_b05a = sample(ihs5_roster$hh_b05a, 1),
    # hh_b05b is only filled in when hh_b05a is below 5
    hh_b05b = dplyr::case_when(hh_b05a < 5 ~ sample(1:11, 1), TRUE ~ NA)
  )

# Add the other blank columns from the original dataset
sample_roster <- ihs5_roster |>
  dplyr::filter(case_id == "") |>
  dplyr::bind_rows(sample_roster)

# Attach stata column labels
for (col in names(sample_roster)) {
  attr(sample_roster[[col]], "label") <- attr(ihs5_roster[[col]], "label")
}

# Write out the sample roster
haven::write_dta(
  sample_roster,
  here::here("data", "sample_data", "MWI-IHSV", "HH_MOD_B_vMAPS.dta")
)
10.5.5 Create sample “HH_MOD_D.dta”
# Import original data
original_health <- haven::read_dta(here::here("data-ignore", "IHS5", "HH_MOD_D.dta"))

# Use the sample_roster to create a sample_health dataset
sample_health <- sample_roster |>
  dplyr::select(case_id, HHID) |>
  dplyr::rowwise() |>
  dplyr::mutate(
    hh_d05a = sample(c(original_health$hh_d05a), 1),
    hh_d05b = sample(original_health$hh_d05b, 1)
  )

# Add the other blank columns from the original dataset
sample_health <- original_health |>
  dplyr::filter(case_id == "") |>
  dplyr::bind_rows(sample_health)

# Attach stata column labels. They must come from the original data:
# copying them from sample_health onto itself would change nothing.
for (col in names(sample_health)) {
  attr(sample_health[[col]], "label") <- attr(original_health[[col]], "label")
}

# Write out the sample health module
haven::write_dta(
  sample_health,
  here::here("data", "sample_data", "MWI-IHSV", "HH_MOD_D_vMAPS.dta")
)