distilbertqatrain.py [149:173]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    train_contexts, train_questions, train_answers = read_squad(train_file)
    val_contexts, val_questions, val_answers = read_squad(test_file)
    
    # get the character position at which the answer ends in the passage
    add_end_idx(train_answers, train_contexts)
    add_end_idx(val_answers, val_contexts)
    
    # Use only a subset of the data for training and evaluation. Remove this block to train on the full dataset.
    train_contexts = train_contexts[:200]
    train_questions = train_questions[:200]
    train_answers = train_answers[:200]
    val_contexts = val_contexts[:200]
    val_questions = val_questions[:200]
    val_answers = val_answers[:200]
    
    # tokenize our context/question pairs
    train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
    val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)    
    
    # convert our character start/end positions to token start/end positions
    add_token_positions(train_encodings, train_answers)
    add_token_positions(val_encodings, val_answers)    
    
    train_dataset = SquadDataset(train_encodings)
    test_dataset = SquadDataset(val_encodings)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
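
Both matched regions lean on helpers that sit outside the clone: add_end_idx (which the first comment describes) and add_token_positions (which maps the character spans to token indices). Neither appears in the excerpts, so the following is only a minimal sketch of what they might look like, assuming SQuAD-style answer dicts with 'text' and 'answer_start' keys, a fast DistilBERT tokenizer (the 'distilbert-base-uncased' checkpoint is a placeholder; the excerpts do not show which one is loaded), and batched encodings that expose char_to_token:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
from transformers import DistilBertTokenizerFast

# Placeholder checkpoint -- the excerpts do not show which tokenizer is loaded.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def add_end_idx(answers, contexts):
    # Record the character index at which each gold answer ends in its passage.
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        # SQuAD character offsets are occasionally off by one or two;
        # shift the span until it matches the passage text.
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx - 1:end_idx - 1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1
        elif context[start_idx - 2:end_idx - 2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2

def add_token_positions(encodings, answers):
    # Convert the character-level spans into token-level start/end positions.
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_positions.append(encodings.char_to_token(i, answer['answer_start']))
        end_positions.append(encodings.char_to_token(i, answer['answer_end'] - 1))
        # If the answer was truncated away, fall back to the model's max length.
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -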



scripts/train.py [130:154]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    train_contexts, train_questions, train_answers = read_squad(train_file)
    val_contexts, val_questions, val_answers = read_squad(test_file)
    
    # get the character position at which the answer ends in the passage
    add_end_idx(train_answers, train_contexts)
    add_end_idx(val_answers, val_contexts)
    
    # Use only a subset of the data for training and evaluation. Remove this block to train on the full dataset.
    train_contexts = train_contexts[:200]
    train_questions = train_questions[:200]
    train_answers = train_answers[:200]
    val_contexts = val_contexts[:200]
    val_questions = val_questions[:200]
    val_answers = val_answers[:200]
    
    # tokenize our context/question pairs
    train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
    val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)    
    
    # convert our character start/end positions to token start/end positions
    add_token_positions(train_encodings, train_answers)
    add_token_positions(val_encodings, val_answers)    
    
    train_dataset = SquadDataset(train_encodings)
    test_dataset = SquadDataset(val_encodings)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
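
The SquadDataset wrapper used to build train_dataset and test_dataset is likewise outside both matched regions. A plausible minimal version, assuming the padded encodings behave like a dict of equal-length lists (input_ids, attention_mask, plus the start/end positions added above), is a thin torch Dataset:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import torch

class SquadDataset(torch.utils.data.Dataset):
    # Wraps the already padded encodings so a DataLoader can index them.
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # One example as a dict of tensors (keys match the encoding columns).
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

With a wrapper along these lines, the two datasets built above can be fed directly to a torch DataLoader or the transformers Trainer.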



