def load_nips2011_papers(path)

in example_zoo/tensorflow/probability/deep_exponential_family/trainer/deep_exponential_family.py


import csv
import os
import urllib.request

import numpy as np
import tensorflow as tf


def load_nips2011_papers(path):
  """Loads NIPS 2011 conference papers.

  The NIPS 1987-2015 data set is in the form of an 11,463 x 5,812 matrix of
  per-paper word counts, containing 11,463 words and 5,811 NIPS conference
  papers (Perrone et al., 2016); the extra column arises because the CSV's
  first column stores the words themselves. We subset to papers from 2011 and
  to words that appear in at least two of those papers with a total count of
  at least 10.

  Built from the Observations Python package.

  Args:
    path: str.
      Path to a directory that either already contains the file or to which
      it will be downloaded. The filename is `NIPS_1987-2015.csv`.

  Returns:
    bag_of_words: np.ndarray of shape [num_documents, num_words]. Each element
      denotes the number of occurrences of a specific word in a specific
      document.
    words: List of strings, denoting the words for `bag_of_words`'s columns.
  """
  path = os.path.expanduser(path)
  filename = "NIPS_1987-2015.csv"
  filepath = os.path.join(path, filename)
  if not os.path.exists(filepath):
    url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/"
           "00371/NIPS_1987-2015.csv")
    if not tf.io.gfile.exists(path):
      tf.io.gfile.makedirs(path)
    print("Downloading %s to %s" % (url, filepath))
    urllib.request.urlretrieve(url, filepath)

  with open(filepath, newline="") as f:  # the csv module expects newline=""
    iterator = csv.reader(f)
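    # The first row is a header: a leading cell above the word column,
    # followed by one document identifier per paper (each begins with the
    # publication year).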
    documents = next(iterator)[1:]
    words = []
    x_train = []
    for row in iterator:
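      # Each data row holds a word followed by its count in every paper.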
      words.append(row[0])
      x_train.append(row[1:])

  x_train = np.array(x_train, dtype=np.int64)
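  # At this point rows index words and columns index documents.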

  # Subset to documents from 2011 and to words appearing in at least two of
  # those documents with a total count of at least 10.
  doc_idx = [i for i, document in enumerate(documents)
             if document.startswith("2011")]
  documents = [documents[doc] for doc in doc_idx]
  x_train = x_train[:, doc_idx]
  word_idx = np.logical_and(np.sum(x_train != 0, 1) >= 2,
                            np.sum(x_train, 1) >= 10)
  words = [word for word, idx in zip(words, word_idx) if idx]
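  # Transpose so rows index documents and columns index words, matching the
  # [num_documents, num_words] shape in the docstring.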
  bag_of_words = x_train[word_idx, :].T
  return bag_of_words, words
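
A minimal usage sketch (the directory below is only an illustrative choice;
any writable path works, and the CSV is downloaded on the first call):

bag_of_words, words = load_nips2011_papers("/tmp/nips-data")
print(bag_of_words.shape)   # (num_2011_papers, num_kept_words)
print(words[:10])           # first ten words of the filtered vocabulary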