in spinoffs/inference_gym/inference_gym/internal/data.py [0:0]
def radon(
state,
num_examples,
num_counties,
drop_floors_above_1=True,
train_fraction=1.,
shuffle=False,
shuffle_seed=42):
"""The Radon dataset [1] loaded from `tensorflow_datasets`.
Radon is a radioactive gas that enters homes through contact points with the
ground. It is a carcinogen that is the primary cause of lung cancer in
non-smokers. Radon levels vary greatly from household to household.
This dataset contains measured radon concentrations from houses in the
United States and associated features.
Args:
state: `str`, the two-character code for the U.S. state by which to filter
the data. If `None`, data for all states are returned.
num_examples: `int`, total number of examples in the filtered dataset. When
the dataset is materialized, this value is verified against the size of
the filtered dataset, and if there is a mismatch a ValueError is raised.
(The value is not determined automatically from the dataset because it is
needed before the dataset is materialized).
num_counties: `int`, number of unique counties in the filtered dataset. When
the dataset is materialized, this value is verified against the size of
the filtered dataset, and if there is a mismatch a ValueError is raised.
(The value is not determined automatically from the dataset because it is
needed before the dataset is materialized).
drop_floors_above_1: Whether to drop datapoints with `floor` being greater
than 1. In particular, floor equals 9 when it is unknown and the standard
model uses the floor as a boolean indicator.
train_fraction: What fraction of the data to put in the training set.
shuffle: `bool`. If `True`, shuffle the data.
shuffle_seed: `int`, RNG seed to use if shuffling the data.
Returns:
dataset: A Dict with the following keys:
`num_counties`: `int`, number of unique counties in the filtered dataset.
`train_log_uranium`: Floating-point `Tensor` with shape
`[num_train]`. Soil uranium measurements.
`train_floor`: Integer `Tensor` with shape `[num_train]`. Floor of the
house on which the measurement was taken.
`train_county`: Integer `Tensor` with values in `range(0, num_counties)`
of shape `[num_train]`. County in which the measurement was taken.
`train_floor_by_county`: Floating-point `Tensor` with shape
`[num_train]`. Average floor on which the measurement was taken for the
county in which each house is located (the `Tensor` will have
`num_counties` unique values). This represents the contextual effect.
`train_log_radon`: Floating-point `Tensor` with shape `[num_train]`.
Radon measurement for each house (the dependent variable in the model).
`test_log_uranium`: Floating-point `Tensor` with shape `[num_test`. Soil
Soil uranium measurements for the test set.
`test_floor`: Integer `Tensor` with shape `[num_test]`. Floor of the house
house on which the measurement was taken.
`test_county`: Integer `Tensor` with values in `range(0, num_counties)` of
shape `[num_test]`. County in which the measurement was taken. Can be
`None`, in which case test-related sample transformations are not
computed.
`test_floor_by_county`: Floating-point `Tensor` with shape
`[num_test]`. Average floor on which the measurement was taken
(calculated from the training set) for the county in which each house is
located (the `Tensor` will have `num_counties` unique values). This
represents the contextual effect.
`test_log_radon`: Floating-point `Tensor` with shape `[num_test]`. Radon
measurement for each house (the dependent variable in the model).
Raises:
ValueError if `num_examples` is not equal to the number of examples in the
materialized dataset.
ValueError if `num_counties` is not equal to the number of unique counties
in the materialized dataset.
#### References
[1] Gelman, A., & Hill, J. (2007). Data Analysis Using Regression and
Multilevel/Hierarchical Models (1st ed.). Cambridge University Press.
http://www.stat.columbia.edu/~gelman/arm/examples/radon/
"""
num_train = int(num_examples * train_fraction)
num_test = num_examples - num_train
def load_dataset():
"""Function that actually loads the dataset."""
if load_dataset.dataset is not None:
return load_dataset.dataset
with tf.name_scope('radon'), tf.init_scope():
dataset = _tfds().load(name='radon:1.*.*', split='train', batch_size=-1)
dataset = _tfds().as_numpy(dataset)
states = dataset['features']['state'].astype('U13')
floor = dataset['features']['floor'].astype(np.int32)
radon_val = dataset['activity'].astype(np.float64)
county_strings = dataset['features']['county'].astype('U13')
uranium = dataset['features']['Uppm'].astype(np.float64)
if state is not None:
floor = floor[states == state]
radon_val = radon_val[states == state]
county_strings = county_strings[states == state]
uranium = uranium[states == state]
if drop_floors_above_1:
keep = floor <= 1
floor = floor[keep]
radon_val = radon_val[keep]
county_strings = county_strings[keep]
uranium = uranium[keep]
radon_val[radon_val <= 0.] = 0.1
log_radon = np.log(radon_val)
log_uranium = np.log(uranium)
unique_counties, county = np.unique(county_strings, return_inverse=True)
county = county.astype(np.int32)
if log_radon.size != num_examples:
raise ValueError(
'The size of the filtered dataset must equal the input '
'`num_examples`. Saw dataset size = {}, `num_examples` = {}'
''.format(log_radon.size, num_examples))
if unique_counties.size != num_counties:
raise ValueError(
'The number of counties present in the filtered dataset must equal '
'the input `num_counties`. Saw {} counties but `num_counties` = {}'
''.format(unique_counties.size, num_counties))
if shuffle:
shuffle_idxs = np.arange(num_examples)
np.random.RandomState(shuffle_seed).shuffle(shuffle_idxs)
log_uranium = log_uranium[shuffle_idxs]
floor = floor[shuffle_idxs]
county = county[shuffle_idxs]
log_radon = log_radon[shuffle_idxs]
train_floor = floor[:num_train]
train_county = county[:num_train]
test_floor = floor[num_train:]
test_county = county[num_train:]
# Create a new features for mean of floor across counties.
xbar = []
for i in range(num_counties):
xbar.append(floor[county == i].mean())
floor_by_county = np.array(xbar, dtype=log_radon.dtype)
load_dataset.dataset = dict(
train_log_uranium=log_uranium[:num_train],
train_floor=train_floor,
train_county=train_county,
train_floor_by_county=floor_by_county[train_county],
train_log_radon=log_radon[:num_train],
test_log_uranium=log_uranium[num_train:],
test_floor=test_floor,
test_county=test_county,
test_floor_by_county=floor_by_county[test_county],
test_log_radon=log_radon[num_train:],
)
return load_dataset.dataset
load_dataset.dataset = None
return dict(
num_counties=np.array(num_counties, dtype=np.int32),
train_log_uranium=_defer(
lambda: load_dataset()['train_log_uranium'],
shape=[num_train],
dtype=np.float64),
train_floor=_defer(
lambda: load_dataset()['train_floor'],
shape=[num_train],
dtype=np.int32),
train_county=_defer(
lambda: load_dataset()['train_county'],
shape=[num_train],
dtype=np.int32),
train_floor_by_county=_defer(
lambda: load_dataset()['train_floor_by_county'],
shape=[num_train],
dtype=np.float64),
train_log_radon=_defer(
lambda: load_dataset()['train_log_radon'],
shape=[num_train],
dtype=np.float64),
test_log_uranium=_defer(
lambda: load_dataset()['test_log_uranium'],
shape=[num_test],
dtype=np.float64),
test_floor=_defer(
lambda: load_dataset()['test_floor'],
shape=[num_test],
dtype=np.int32),
test_county=_defer(
lambda: load_dataset()['test_county'],
shape=[num_test],
dtype=np.int32),
test_floor_by_county=_defer(
lambda: load_dataset()['test_floor_by_county'],
shape=[num_test],
dtype=np.float64),
test_log_radon=_defer(
lambda: load_dataset()['test_log_radon'],
shape=[num_test],
dtype=np.float64),
)