forked from AllenDowney/ThinkStats2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnsfg.py
165 lines (120 loc) · 4.45 KB
/
nsfg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function, division
import sys
import numpy as np
import thinkstats2
from collections import defaultdict
def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Loads the NSFG respondent file into a DataFrame.

    The Stata dictionary describes the fixed-width layout; the data file
    is read as gzip-compressed input and then passed through CleanFemResp.

    dct_file: string file name of the Stata dictionary
    dat_file: string file name of the gzipped fixed-width data
    nrows: forwarded unchanged to ReadFixedWidth
           (presumably a cap on the number of rows read -- TODO confirm)

    returns: DataFrame
    """
    read_options = {'compression': 'gzip', 'nrows': nrows}
    respondents = thinkstats2.ReadStataDct(dct_file).ReadFixedWidth(
        dat_file, **read_options)

    # recoding happens in place on the frame that is returned
    CleanFemResp(respondents)
    return respondents
def CleanFemResp(df):
    """Recodes variables from the respondent frame.

    Currently a placeholder: no variables are recoded and `df` is left
    untouched.

    df: DataFrame

    returns: None
    """
    # nothing to clean yet
    return None
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz',
                nrows=None):
    """Reads the NSFG pregnancy data.

    The Stata dictionary describes the fixed-width layout; the data file
    is read as gzip-compressed input and then recoded by CleanFemPreg.

    dct_file: string file name
    dat_file: string file name
    nrows: optional; when not None it is forwarded to ReadFixedWidth as
           `nrows`, the same option ReadFemResp exposes
           (presumably a cap on the number of rows read -- TODO confirm)

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)

    # only forward nrows when the caller asked for it, so the default call
    # is exactly what it was before the parameter existed
    options = {'compression': 'gzip'}
    if nrows is not None:
        options['nrows'] = nrows

    df = dct.ReadFixedWidth(dat_file, **options)
    CleanFemPreg(df)
    return df
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    Modifies `df` in place. Every column is (re)assigned through
    `df[name] = ...` rather than attribute access or chained
    `replace(..., inplace=True)`: the chained form operates on a temporary
    Series and is not guaranteed to write back to the frame (it warns, or
    does nothing, under pandas Copy-on-Write).

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz, hpagelb,
        babysex and nbrnaliv; totalwgt_lb and cmintvw are written

    returns: None
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df['birthwgt_lb'] = df['birthwgt_lb'].mask(df['birthwgt_lb'] > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for name in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[name] = df[name].replace(na_vals, np.nan)

    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan
def ValidatePregnum(resp, preg):
    """Validate pregnum in the respondent file.

    Checks that, for every respondent, `pregnum` equals the number of
    records carrying the same caseid in the pregnancy frame.

    resp: respondent DataFrame (reads the `caseid` and `pregnum` columns)
    preg: pregnancy DataFrame (reads the `caseid` column)

    returns: True if every respondent matches; False at the first
             mismatch, after printing caseid, record count and pregnum
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = MakePregMap(preg)

    # walk caseid and pregnum in lockstep; this avoids Series.iteritems()
    # (removed in pandas 2.0) and a label lookup per row
    for caseid, pregnum in zip(resp.caseid, resp.pregnum):
        # .get keeps respondents without pregnancies from adding empty
        # entries to the map
        indices = preg_map.get(caseid, ())

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame with a `caseid` column

    returns: defaultdict(list) that maps from caseid to the list of index
             labels of the rows in `df` with that caseid, in row order
    """
    d = defaultdict(list)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
def main():
    """Tests the functions in this module.

    Reads the respondent and pregnancy files using the default file names,
    asserts a set of known row counts and value counts, and checks that
    `pregnum` in the respondent file agrees with the pregnancy file.
    Prints a confirmation when every check passes; a failed check raises
    AssertionError.
    """
    # read and validate the respondent file
    resp = ReadFemResp()

    assert len(resp) == 7643
    assert resp.pregnum.value_counts()[1] == 1267

    # read and validate the pregnancy file
    preg = ReadFemPreg()
    print(preg.shape)

    assert len(preg) == 13593
    assert preg.caseid[13592] == 12571
    assert preg.pregordr.value_counts()[1] == 5033
    assert preg.nbrnaliv.value_counts()[1] == 8981
    assert preg.babysex.value_counts()[1] == 4641
    assert preg.birthwgt_lb.value_counts()[7] == 3049
    assert preg.birthwgt_oz.value_counts()[0] == 1037
    assert preg.prglngth.value_counts()[39] == 4744
    assert preg.outcome.value_counts()[1] == 9148
    assert preg.birthord.value_counts()[1] == 4413
    assert preg.agepreg.value_counts()[22.75] == 100
    assert preg.totalwgt_lb.value_counts()[7.5] == 302

    # the largest finalwgt value should occur exactly six times;
    # reuse the counts instead of computing them a second time
    weights = preg.finalwgt.value_counts()
    key = max(weights.keys())
    assert weights[key] == 6

    # validate that the pregnum column in `resp` matches the number
    # of entries in `preg`
    assert ValidatePregnum(resp, preg)

    print('All tests passed.')


if __name__ == '__main__':
    main()