in torchtext/experimental/vectors.py
def GloVe(name="840B", dim=300, unk_tensor=None, root=".data", validate_file=True, num_cpus=32):
r"""Create a GloVe Vectors object.
Args:
name (str): the name of the GloVe dataset to use. Options are:
- 42B
- 840B
- twitter.27B
- 6B
dim (int): the dimension for the GloVe dataset to load. Options are:
42B:
- 300
840B:
- 300
twitter.27B:
- 25
- 50
- 100
- 200
6B:
- 50
- 100
- 200
- 300
unk_tensor (Tensor): a 1d tensor representing the vector associated with an unknown token.
root (str): folder used to store downloaded files in (.data)
validate_file (bool): flag to determine whether to validate the downloaded files checksum.
Should be `False` when running tests with a local asset.
num_cpus (int): the number of cpus to use when loading the vectors from file. Default: 10.
    Returns:
        torchtext.experimental.vectors.Vectors: a Vectors object.

    Raises:
        ValueError: if unexpected duplicate tokens are found in the GloVe file.
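
    Examples:
        >>> # Illustrative usage (a sketch, not a guaranteed-fast call: the
        >>> # pretrained archive is downloaded on first use, which may take a while).
        >>> glove = GloVe(name="6B", dim=50)
        >>> glove["the"].shape  # token lookup goes through Vectors.__getitem__
        torch.Size([50])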
"""
    # The 840B file is known to contain exactly one duplicate token, a long run
    # of Unicode replacement characters; it is whitelisted here so that it does
    # not trigger the duplicate-token ValueError below.
    dup_token_glove_840b = ["����������������������������������������������������������������������"
                            "����������������������������������������������������������������������"
                            "����������������������������������������������������������������������"
                            "����������������������������������������������������������������������"
                            "������������������������������������������������������"]
    urls = {
        "42B": "https://nlp.stanford.edu/data/glove.42B.300d.zip",
        "840B": "https://nlp.stanford.edu/data/glove.840B.300d.zip",
        "twitter.27B": "https://nlp.stanford.edu/data/glove.twitter.27B.zip",
        "6B": "https://nlp.stanford.edu/data/glove.6B.zip",
    }
    valid_glove_file_names = {
        "glove.42B.300d.txt",
        "glove.840B.300d.txt",
        "glove.twitter.27B.25d.txt",
        "glove.twitter.27B.50d.txt",
        "glove.twitter.27B.100d.txt",
        "glove.twitter.27B.200d.txt",
        "glove.6B.50d.txt",
        "glove.6B.100d.txt",
        "glove.6B.200d.txt",
        "glove.6B.300d.txt",
    }
    file_name = "glove.{}.{}d.txt".format(name, dim)
    if file_name not in valid_glove_file_names:
        raise ValueError("Could not find GloVe file with name {}. Please check that `name` and `dim` "
                         "are valid.".format(file_name))
    url = urls[name]
    # Resolve the expected checksum only when file validation is requested.
    checksum = None
    if validate_file:
        checksum = CHECKSUMS_GLOVE.get(url, None)
    downloaded_file_path = download_from_url(url, root=root, hash_value=checksum)
    extracted_file_paths = extract_archive(downloaded_file_path)
    # need to get the full path to the correct file in the case when multiple files are extracted with different dims
    extracted_file_path_with_correct_dim = [path for path in extracted_file_paths if file_name in path][0]
    cpp_vectors_obj, dup_tokens = _load_token_and_vectors_from_file(extracted_file_path_with_correct_dim, " ", num_cpus, unk_tensor)
    # Ensure the only duplicate token present is the expected one from the 840B dataset.
    if dup_tokens and dup_tokens != dup_token_glove_840b:
        raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))

    vectors_obj = Vectors(cpp_vectors_obj)
    return vectors_obj
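
# A hedged usage sketch (illustrative, not part of this module): per the
# docstring, tokens absent from the vocabulary resolve to the `unk_tensor`
# supplied at construction. The lookup token below is hypothetical.
#
#     import torch
#     zeros = torch.zeros(50)
#     glove = GloVe(name="6B", dim=50, unk_tensor=zeros)
#     assert torch.equal(glove["token-not-in-glove"], zeros)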