in torchtext/experimental/vectors.py
def GloVe(name="840B", dim=300, unk_tensor=None, root=".data", validate_file=True, num_cpus=32):
r"""Create a GloVe Vectors object.
Args:
name (str): the name of the GloVe dataset to use. Options are:
- 42B
- 840B
- twitter.27B
- 6B
dim (int): the dimension for the GloVe dataset to load. Options are:
42B:
- 300
840B:
- 300
twitter.27B:
- 25
- 50
- 100
- 200
6B:
- 50
- 100
- 200
- 300
unk_tensor (Tensor): a 1d tensor representing the vector associated with an unknown token.
root (str): folder used to store downloaded files in (.data)
validate_file (bool): flag to determine whether to validate the downloaded files checksum.
Should be `False` when running tests with a local asset.
num_cpus (int): the number of cpus to use when loading the vectors from file. Default: 10.
    Returns:
        torchtext.experimental.vectors.Vectors: a Vectors object.

    Raises:
        ValueError: if unexpected duplicate tokens are found in the GloVe file.
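
    Examples:
        >>> # Illustrative usage (a sketch, not a guaranteed-fast call: the
        >>> # pretrained archive is downloaded on first use, which may take a while).
        >>> glove = GloVe(name="6B", dim=50)
        >>> glove["the"].shape  # token lookup goes through Vectors.__getitem__
        torch.Size([50])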
"""
    # The 840B file is known to contain exactly one duplicate token, a long run
    # of Unicode replacement characters; it is whitelisted here so that it does
    # not trigger the duplicate-token ValueError below.
    dup_token_glove_840b = ["����������������������������������������������������������������������"
                            "����������������������������������������������������������������������"
                            "����������������������������������������������������������������������"
                            "����������������������������������������������������������������������"
                            "������������������������������������������������������"]
    urls = {
        "42B": "https://nlp.stanford.edu/data/glove.42B.300d.zip",
        "840B": "https://nlp.stanford.edu/data/glove.840B.300d.zip",
        "twitter.27B": "https://nlp.stanford.edu/data/glove.twitter.27B.zip",
        "6B": "https://nlp.stanford.edu/data/glove.6B.zip",
    }
    valid_glove_file_names = {
        "glove.42B.300d.txt",
        "glove.840B.300d.txt",
        "glove.twitter.27B.25d.txt",
        "glove.twitter.27B.50d.txt",
        "glove.twitter.27B.100d.txt",
        "glove.twitter.27B.200d.txt",
        "glove.6B.50d.txt",
        "glove.6B.100d.txt",
        "glove.6B.200d.txt",
        "glove.6B.300d.txt",
    }
    file_name = "glove.{}.{}d.txt".format(name, dim)
    if file_name not in valid_glove_file_names:
        raise ValueError("Could not find GloVe file with name {}. Please check that `name` and `dim` "
                         "are valid.".format(file_name))
    url = urls[name]
    # Resolve the expected checksum only when file validation is requested.
    checksum = None
    if validate_file:
        checksum = CHECKSUMS_GLOVE.get(url, None)
    downloaded_file_path = download_from_url(url, root=root, hash_value=checksum)
    extracted_file_paths = extract_archive(downloaded_file_path)
    # need to get the full path to the correct file in the case when multiple files are extracted with different dims
    extracted_file_path_with_correct_dim = [path for path in extracted_file_paths if file_name in path][0]
    cpp_vectors_obj, dup_tokens = _load_token_and_vectors_from_file(extracted_file_path_with_correct_dim, " ", num_cpus, unk_tensor)
    # Ensure the only duplicate token present is the expected one from the 840B dataset.
    if dup_tokens and dup_tokens != dup_token_glove_840b:
        raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))

    vectors_obj = Vectors(cpp_vectors_obj)
    return vectors_obj
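
# A hedged usage sketch (illustrative, not part of this module): per the
# docstring, tokens absent from the vocabulary resolve to the `unk_tensor`
# supplied at construction. The lookup token below is hypothetical.
#
#     import torch
#     zeros = torch.zeros(50)
#     glove = GloVe(name="6B", dim=50, unk_tensor=zeros)
#     assert torch.equal(glove["token-not-in-glove"], zeros)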