The ipumsr package helps import IPUMS extracts from the IPUMS website into R.
The ipumsr package is now on CRAN and can be installed by running the following command:
install.packages("ipumsr")
Or, you can install the development version using the following commands:
# Install devtools only when it is missing. requireNamespace() checks that the
# package is available without attaching it to the search path; the call below
# is namespace-qualified (devtools::), so attaching is unnecessary.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("mnpopcenter/ipumsr")
There are several vignettes included in the package:
You can access them with the vignette() command (e.g., vignette("value-labels")).
If you are installing from GitHub and want the vignettes, you’ll need to run the following commands first:
devtools::install_github("mnpopcenter/ipumsr/ipumsexamples")
devtools::install_github("mnpopcenter/ipumsr", build_vignettes = TRUE)
suppressPackageStartupMessages({
library(ipumsr)
library(haven)
library(ggplot2) # ggplot2 version > 2.2.1 (development version as of 8/15/2017)
library(dplyr)
library(sf)
})
Relies on the user downloading the .xml DDI file and the .dat/.dat.gz data file (the data file doesn’t need to be unzipped).
# Use example file included with package:
cps_hier_file <- ipums_example("cps_00010.xml")
ddi <- read_ipums_ddi(cps_hier_file)
data <- read_ipums_micro(ddi)
#> Use of data from IPUMS-CPS is subject to conditions including that users should
#> cite the data appropriately. Use command `ipums_conditions()` for more details.
#>
#> Reading data...
#> Parsing data...
# Variable description for the month variable
cat(ipums_var_desc(ddi, MONTH))
#> MONTH indicates the calendar month of the CPS interview.
# Hierarchical data loaded as a data frame
# Value labels loaded as haven::labelled class
# Convert to factors with `as_factor`
table(as_factor(data$MONTH, levels = "both"))
#>
#> [1] January [2] February [3] March [4] April [5] May
#> 0 0 3385 0 0
#> [6] June [7] July [8] August [9] September [10] October
#> 0 0 0 0 0
#> [11] November [12] December
#> 0 0
# Can also load as a list by rectype
data <- read_ipums_micro_list(ddi, verbose = FALSE)
# Household data
data$HOUSEHOLD
#> # A tibble: 3,385 x 6
#> RECTYPE YEAR SERIAL HWTSUPP STATEFIP MONTH
#> <chr+lbl> <dbl> <dbl> <dbl> <int+lbl> <int+lbl>
#> 1 H 1962 80 1475.59 55 3
#> 2 H 1962 82 1597.61 27 3
#> 3 H 1962 83 1706.65 27 3
#> 4 H 1962 84 1790.25 27 3
#> 5 H 1962 107 4355.40 19 3
#> 6 H 1962 108 1479.05 19 3
#> 7 H 1962 122 3602.75 27 3
#> 8 H 1962 124 4104.41 55 3
#> 9 H 1962 125 2182.17 55 3
#> 10 H 1962 126 1826.38 55 3
#> # ... with 3,375 more rows
# Person data
data$PERSON
#> # A tibble: 7,668 x 6
#> RECTYPE YEAR SERIAL PERNUM WTSUPP INCTOT
#> <chr+lbl> <dbl> <dbl> <dbl> <dbl> <dbl+lbl>
#> 1 P 1962 80 1 1475.59 4883
#> 2 P 1962 80 2 1470.72 5800
#> 3 P 1962 80 3 1578.75 99999998
#> 4 P 1962 82 1 1597.61 14015
#> 5 P 1962 83 1 1706.65 16552
#> 6 P 1962 84 1 1790.25 6375
#> 7 P 1962 107 1 4355.40 99999999
#> 8 P 1962 107 2 1385.81 0
#> 9 P 1962 107 3 1629.10 600
#> 10 P 1962 107 4 1432.24 99999999
#> # ... with 7,658 more rows
Relies on the user downloading the .xml DDI file and the .dat/.dat.gz data file (the data file doesn’t need to be unzipped).
# Use example file included with package
cps_rect_file <- ipums_example("cps_00006.xml")
data <- read_ipums_micro(cps_rect_file, verbose = FALSE)
# While working interactively, can get convenient display of variable information
# in RStudio's viewer
ipums_view(data)
Relies on the user downloading the csv file (with or without a header row) and the shape files (neither needs to be unzipped).
Note that to save space when including this data on CRAN, the shape file has been reduced to 1% of the points in the polygon of the PMSA. The original shape file can be found in the ipumsexamples package.
data <- read_nhgis_sf(
ipums_example("nhgis0008_csv.zip"),
shape_file = ipums_example("nhgis0008_shape_small.zip"),
verbose = FALSE
)
ipums_var_info(data, starts_with("D6Z"))
#> # A tibble: 8 x 4
#> var_name var_label var_desc val_labels
#> <chr> <chr> <chr> <list>
#> 1 D6Z001 1989 to March 1990 Year Structure Built (D6Z) <tibble [0 x 2]>
#> 2 D6Z002 1985 to 1988 Year Structure Built (D6Z) <tibble [0 x 2]>
#> 3 D6Z003 1980 to 1984 Year Structure Built (D6Z) <tibble [0 x 2]>
#> 4 D6Z004 1970 to 1979 Year Structure Built (D6Z) <tibble [0 x 2]>
#> 5 D6Z005 1960 to 1969 Year Structure Built (D6Z) <tibble [0 x 2]>
#> 6 D6Z006 1950 to 1959 Year Structure Built (D6Z) <tibble [0 x 2]>
#> 7 D6Z007 1940 to 1949 Year Structure Built (D6Z) <tibble [0 x 2]>
#> 8 D6Z008 1939 or earlier Year Structure Built (D6Z) <tibble [0 x 2]>
# Add the share of homes built before 1950: the two oldest "Year Structure
# Built" counts (D6Z007 = 1940 to 1949, D6Z008 = 1939 or earlier) divided by
# the sum of all eight D6Z categories listed above.
data <- data %>%
mutate(
pct_before_1950 = (D6Z007 + D6Z008) /
(D6Z001 + D6Z002 + D6Z003 + D6Z004 + D6Z005 + D6Z006 + D6Z007 + D6Z008)
)
# Note the function `geom_sf()` is currently only in the development version,
# so you may need to update ggplot2 to run using
# devtools::install_github("tidyverse/ggplot2")
# The guard below checks whether the installed ggplot2 exports `geom_sf`;
# when it does not, the plot is skipped rather than attempted.
if ("geom_sf" %in% getNamespaceExports("ggplot2")) {
# Draw the geometries in `data`, filling each one by its pct_before_1950 value.
ggplot(data = data) +
geom_sf(aes(fill = pct_before_1950)) +
labs(
title = "Percent of homes built before 1950",
subtitle = "By Primary Metropolitan Statistical Area in 1990 Census",
caption = "Simplified PMSA boundaries (1% of polygon points retained)"
)
}
There is experimental support for loading terrapop data, but examples are too large to include in the package.
# NOTE(review): the three readers below are called with `:::` rather than `::`,
# so they are presumably not exported from ipumsr -- confirm before relying on
# them. The "*_bundle.zip" paths are not resolved through ipums_example(), so
# they appear to stand in for extracts the user has downloaded separately.

# Raster data
data <- ipumsr:::read_terra_raster(
"2552_bundle.zip",
"CROPLAND2000ZM2013.tiff",
verbose = FALSE
)
# Area data
data <- ipumsr:::read_terra_area(
"2553_bundle.zip",
verbose = FALSE
)
# Microdata
data <- ipumsr:::read_terra_micro(
"2554_bundle.zip",
verbose = FALSE
)