# pyro/contrib/examples/bart.py
def load_bart_od():
    """
    Load a dataset of hourly origin-destination ridership counts for every
    pair of BART stations during the years 2011-2019.

    **Source** https://www.bart.gov/about/reports/ridership

    This downloads the dataset the first time it is called. On subsequent
    calls this reads from a locally cached ``.pkl.bz2`` file. This attempts
    to download a preprocessed compressed cached file maintained by the Pyro
    team. On cache hit this should be very fast. On cache miss this falls
    back to downloading the original data source and preprocessing the
    dataset, requiring about 350MB of file transfer, storing a few GB of
    temp files, and taking upwards of 30 minutes.

    :returns: a dict with fields:

        - "stations": a list of strings of station names
        - "start_date": a :py:class:`datetime.datetime` for the first
          observation
        - "counts": a ``torch.FloatTensor`` of ridership counts, with shape
          ``(num_hours, len(stations), len(stations))``.
    """
    _mkdir_p(DATA)
    filename = os.path.join(DATA, "bart_full.pkl.bz2")
    # Work around an apparent bug in torch.load() / torch.save().
    pkl_file = filename.rsplit(".", 1)[0]
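    # Here ``filename`` is the compressed cache and ``pkl_file`` is the
    # uncompressed copy that torch.load() actually reads.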
    if not os.path.exists(pkl_file):
        try:
            urllib.request.urlretrieve(CACHE_URL, filename)
            logging.debug("cache hit, uncompressing")
            with bz2.BZ2File(filename) as src, open(pkl_file, "wb") as dst:
                dst.write(src.read())
        except urllib.error.HTTPError:
            logging.debug("cache miss, preprocessing from scratch")
    if os.path.exists(pkl_file):
        return torch.load(pkl_file)
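    # Cache miss: preprocess each raw source file in a separate process,
    # then merge the partial datasets into a single dense tensor below.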
    with multiprocessing.Pool(len(SOURCE_FILES)) as pool:
        filenames = pool.map(_load_hourly_od, SOURCE_FILES)
    datasets = list(map(torch.load, filenames))

    stations = sorted(set().union(*(d["stations"].keys() for d in datasets)))
    min_time = min(int(d["rows"][:, 0].min()) for d in datasets)
    max_time = max(int(d["rows"][:, 0].max()) for d in datasets)
    num_rows = max_time - min_time + 1
    start_date = datasets[0]["start_date"] + datetime.timedelta(hours=min_time)
    logging.info("Loaded data from {} stations, {} hours"
                 .format(len(stations), num_rows))
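    # Assemble one dense (hour, origin, destination) tensor. Each partial
    # dataset stores sparse rows of (time, origin, destin, count); advanced
    # indexing scatters them into the full tensor.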
    result = torch.zeros(num_rows, len(stations), len(stations))
    for dataset in datasets:
        # Map this partial dataset's local station ids to global station ids.
        part_stations = sorted(dataset["stations"],
                               key=dataset["stations"].__getitem__)
        part_to_whole = torch.tensor(list(map(stations.index, part_stations)))
        time = dataset["rows"][:, 0] - min_time
        origin = part_to_whole[dataset["rows"][:, 1]]
        destin = part_to_whole[dataset["rows"][:, 2]]
        count = dataset["rows"][:, 3].float()
        result[time, origin, destin] = count
        dataset.clear()  # free memory as we go
    logging.info("Loaded {} shaped data of mean {:0.3g}"
                 .format(result.shape, result.mean()))
    dataset = {
        "stations": stations,
        "start_date": start_date,
        "counts": result,
    }
    torch.save(dataset, pkl_file)
    # Compress a copy for any future cache; -k keeps the uncompressed .pkl,
    # which torch.load() reads directly on the next call.
    subprocess.check_call(["bzip2", "-k", pkl_file])
    assert os.path.exists(filename)
    return dataset
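
# A minimal usage sketch (not part of the original module; the import path is
# assumed from the file location above). It loads the dataset and sums over
# origin and destination stations to get system-wide hourly ridership:
#
#     from pyro.contrib.examples.bart import load_bart_od
#
#     dataset = load_bart_od()
#     print(dataset["start_date"], len(dataset["stations"]))
#     counts = dataset["counts"]            # (num_hours, S, S)
#     hourly_total = counts.sum([1, 2])     # total riders per hour
#     print(hourly_total[:24])              # first observed day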