# example_zoo/tensorflow/probability/deep_exponential_family/trainer/deep_exponential_family.py
import csv
import os
import urllib.request

import numpy as np
import tensorflow as tf


def load_nips2011_papers(path):
"""Loads NIPS 2011 conference papers.
The NIPS 1987-2015 data set is in the form of a 11,463 x 5,812 matrix of
per-paper word counts, containing 11,463 words and 5,811 NIPS conference
papers (Perrone et al., 2016). We subset to papers in 2011 and words appearing
in at least two documents and having a total word count of at least 10.
Built from the Observations Python package.
Args:
path: str.
Path to directory which either stores file or otherwise file will
be downloaded and extracted there. Filename is `NIPS_1987-2015.csv`.
Returns:
bag_of_words: np.ndarray of shape [num_documents, num_words]. Each element
denotes the number of occurrences of a specific word in a specific
document.
words: List of strings, denoting the words for `bag_of_words`'s columns.
"""
  path = os.path.expanduser(path)
  filename = "NIPS_1987-2015.csv"
  filepath = os.path.join(path, filename)
  if not os.path.exists(filepath):
    url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/"
           "00371/NIPS_1987-2015.csv")
    if not tf.io.gfile.exists(path):
      tf.io.gfile.makedirs(path)
    print("Downloading %s to %s" % (url, filepath))
    urllib.request.urlretrieve(url, filepath)
  with open(filepath) as f:
    iterator = csv.reader(f)
    # The first CSV row is a header of document IDs; every later row holds a
    # word followed by its count in each document.
    documents = next(iterator)[1:]
    words = []
    x_train = []
    for row in iterator:
      words.append(row[0])
      x_train.append(row[1:])
  # `np.int` was removed from NumPy; use an explicit integer dtype.
  x_train = np.array(x_train, dtype=np.int64)
  # Subset to documents in 2011 and to words appearing in at least two
  # documents with a total word count of at least 10.
  doc_idx = [i for i, document in enumerate(documents)
             if document.startswith("2011")]
  documents = [documents[doc] for doc in doc_idx]
  x_train = x_train[:, doc_idx]
  # Boolean mask over words: nonzero in >= 2 documents and total count >= 10.
  word_idx = np.logical_and(np.sum(x_train != 0, 1) >= 2,
                            np.sum(x_train, 1) >= 10)
  words = [word for word, keep in zip(words, word_idx) if keep]
  bag_of_words = x_train[word_idx, :].T
  return bag_of_words, words
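

# A minimal usage sketch, not part of the original module: it exercises the
# loader with an arbitrary illustrative cache directory ("/tmp/nips") and
# prints the shapes promised by the docstring. It assumes only the imports
# and function defined above.
if __name__ == "__main__":
  bag_of_words, words = load_nips2011_papers("/tmp/nips")
  print("bag_of_words shape:", bag_of_words.shape)  # [num_documents, num_words]
  print("vocabulary size:", len(words))
  print("sample words:", words[:5])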