def build_vocab_recipe1m(args)

in src/utils/recipe1m_utils.py


import json
import os
import pickle
from collections import Counter

from tqdm import tqdm

# get_ingredient, get_instruction, cluster_ingredients, remove_plurals and the
# Vocabulary wrapper are assumed to be defined in (or imported by) this module.


def build_vocab_recipe1m(args):
    print("Loading data...")

    args.save_path = os.path.join(args.recipe1m_path, 'preprocessed')
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    dets = json.load(open(os.path.join(args.recipe1m_path, 'det_ingrs.json'), 'r'))
    layer1 = json.load(open(os.path.join(args.recipe1m_path, 'layer1.json'), 'r'))
    layer2 = json.load(open(os.path.join(args.recipe1m_path, 'layer2.json'), 'r'))

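    # map recipe id -> index into layer2 (per-recipe image metadata)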
    id2im = {}

    for i, entry in enumerate(layer2):
        id2im[entry['id']] = i

    print("Loaded data.")
    print("Found %d recipes in the dataset." % (len(layer1)))
    replace_dict_ingrs = {'and': ['&', "'n"], '': ['%', ',', '.', '#', '[', ']', '!', '?']}
    replace_dict_instrs = {'and': ['&', "'n"], '': ['#', '[', ']']}

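    # map recipe id -> index into dets (pre-detected ingredient lists)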
    idx2ind = {}
    for i, entry in enumerate(dets):
        idx2ind[entry['id']] = i

    ingrs_file = os.path.join(args.save_path, 'allingrs_count.pkl')
    #####
    # 1. Count words in dataset and clean
    #####
    if os.path.exists(ingrs_file) and not args.forcegen:
        print("loading pre-extracted word counters")
        counter_ingrs = pickle.load(open(args.save_path + 'allingrs_count.pkl', 'rb'))
    else:
        counter_ingrs = Counter()

        for entry in tqdm(layer1):

            # get all instructions for this recipe
            instrs = entry['instructions']

            instrs_list = []
            ingrs_list = []

            # retrieve pre-detected ingredients for this entry
            det_ingrs = dets[idx2ind[entry['id']]]['ingredients']

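            # per-ingredient 'valid' flags mark detections the upstream extractor trusts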
            valid = dets[idx2ind[entry['id']]]['valid']
            det_ingrs_filtered = []

            for j, det_ingr in enumerate(det_ingrs):
                if len(det_ingr) > 0 and valid[j]:
                    det_ingr_undrs = get_ingredient(det_ingr, replace_dict_ingrs)
                    det_ingrs_filtered.append(det_ingr_undrs)
                    ingrs_list.append(det_ingr_undrs)

            # get raw text for instructions of this entry
            acc_len = 0
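            # acc_len accumulates the character length of the cleaned instructions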
            for instr in instrs:
                instr = instr['text']
                instr = get_instruction(instr, replace_dict_instrs)
                if len(instr) > 0:
                    instrs_list.append(instr)
                    acc_len += len(instr)

            # discard recipes with too few or too many ingredients or instructions, or too little instruction text
            if len(ingrs_list) < args.minnumingrs or len(instrs_list) < args.minnuminstrs \
                    or len(instrs_list) >= args.maxnuminstrs or len(ingrs_list) >= args.maxnumingrs \
                    or acc_len < args.minnumwords:
                continue

            if entry['partition'] == 'train':
                counter_ingrs.update(ingrs_list)

        pickle.dump(counter_ingrs, open(ingrs_file, 'wb'))

    # manually add missing entries for better clustering
    base_words = [
        'peppers', 'tomato', 'spinach_leaves', 'turkey_breast', 'lettuce_leaf', 'chicken_thighs',
        'milk_powder', 'bread_crumbs', 'onion_flakes', 'red_pepper', 'pepper_flakes',
        'juice_concentrate', 'cracker_crumbs', 'hot_chili', 'seasoning_mix', 'dill_weed',
        'pepper_sauce', 'sprouts', 'cooking_spray', 'cheese_blend', 'basil_leaves',
        'pineapple_chunks', 'marshmallow', 'chile_powder', 'corn_kernels', 'tomato_sauce',
        'chickens', 'cracker_crust', 'lemonade_concentrate', 'red_chili', 'mushroom_caps',
        'mushroom_cap', 'breaded_chicken', 'frozen_pineapple', 'seaweed', 'bouillon_granules',
        'stuffing_mix', 'parsley_flakes', 'chicken_breast', 'baguettes', 'green_tea',
        'peanut_butter', 'green_onion', 'fresh_cilantro', 'hot_pepper', 'dried_lavender',
        'white_chocolate', 'cake_mix', 'cheese_spread', 'mandarin_orange', 'laurel',
        'cabbage_head', 'pistachio', 'cheese_dip', 'thyme_leave', 'boneless_pork',
        'onion_dip', 'skinless_chicken', 'dark_chocolate', 'canned_corn', 'muffin',
        'frozen_broccoli', 'philadelphia'
    ]

    for base_word in base_words:
        if base_word not in counter_ingrs:
            counter_ingrs[base_word] = 1

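    # cluster ingredient variants under shared keys and merge their counts;
    # cluster_ingrs maps each key to the list of variants it absorbs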
    counter_ingrs, cluster_ingrs = cluster_ingredients(counter_ingrs)
    counter_ingrs, cluster_ingrs = remove_plurals(counter_ingrs, cluster_ingrs)

    # discard ingredients whose (clustered) frequency is below args.threshold_ingrs
    ingrs = {word: cnt for word, cnt in counter_ingrs.items() if cnt >= args.threshold_ingrs}

    # Build the ingredient vocabulary: every variant in a cluster is
    # registered under the same index, so any variant resolves to its cluster id.
    vocab_ingrs = Vocabulary()
    idx = vocab_ingrs.add_word('<end>')  # returns the next free index
    for k in ingrs:
        for ingr in cluster_ingrs[k]:
            idx = vocab_ingrs.add_word(ingr, idx)
        idx += 1
    _ = vocab_ingrs.add_word('<pad>', idx)

    print("Total ingr vocabulary size: {}".format(len(vocab_ingrs)))

    dataset = {'train': [], 'val': [], 'test': []}

    ######
    # 2. Tokenize and build dataset based on vocabularies.
    ######
    for entry in tqdm(layer1):

        # get all instructions for this recipe
        instrs = entry['instructions']

        instrs_list = []
        ingrs_list = []
        images_list = []

        # retrieve pre-detected ingredients for this entry
        det_ingrs = dets[idx2ind[entry['id']]]['ingredients']
        valid = dets[idx2ind[entry['id']]]['valid']
        labels = []

        for j, det_ingr in enumerate(det_ingrs):
            if len(det_ingr) > 0 and valid[j]:
                det_ingr_undrs = get_ingredient(det_ingr, replace_dict_ingrs)
                ingrs_list.append(det_ingr_undrs)
                label_idx = vocab_ingrs(det_ingr_undrs)
                # out-of-vocab ingredients resolve to the <pad> index and are skipped
                if label_idx != vocab_ingrs('<pad>') and label_idx not in labels:
                    labels.append(label_idx)

        # get raw text for instructions of this entry
        acc_len = 0
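        # acc_len accumulates the character length of the cleaned instructions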
        for instr in instrs:
            instr = instr['text']
            instr = get_instruction(instr, replace_dict_instrs)
            if len(instr) > 0:
                acc_len += len(instr)
                instrs_list.append(instr)

        # discard recipes with too few or too many ingredients or instructions, or too little instruction text
        if len(labels) < args.minnumingrs or len(instrs_list) < args.minnuminstrs \
                or len(instrs_list) >= args.maxnuminstrs or len(labels) >= args.maxnumingrs \
                or acc_len < args.minnumwords:
            continue

        if entry['id'] in id2im:
            ims = layer2[id2im[entry['id']]]

            # copy image paths for this recipe
            for im in ims['images']:
                images_list.append(im['id'])

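        # instructions and labels were used only for filtering; keep id, ingredients and images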
        newentry = {
            'id': entry['id'],
            'ingredients': ingrs_list,
            'images': images_list,
        }
        dataset[entry['partition']].append(newentry)

    print('Dataset size:')
    for split in dataset.keys():
        print(split, ':', len(dataset[split]))

    return vocab_ingrs, dataset
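
The function relies on a small Vocabulary wrapper with a specific contract: add_word(word) inserts a word and returns the next free index; add_word(word, idx) pins a word to an existing index, which is how every variant in an ingredient cluster shares one id; calling the instance looks a word up, falling back to the <pad> index for out-of-vocab words; and len() counts indices, not words. The actual class lives elsewhere in the codebase; the following is a minimal sketch consistent with the usage above, not the repository's implementation.

class Vocabulary:
    """Minimal sketch of the wrapper build_vocab_recipe1m relies on."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word, idx=None):
        if idx is None:
            # plain insert: the word gets its own index; return the next free one
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = [word]
                self.idx += 1
            return self.idx
        # pinned insert: the word joins an existing index (an ingredient cluster)
        if word not in self.word2idx:
            self.word2idx[word] = idx
            self.idx2word.setdefault(idx, []).append(word)
        return idx

    def __call__(self, word):
        # unknown words resolve to the <pad> index, which callers filter out
        return self.word2idx.get(word, self.word2idx['<pad>'])

    def __len__(self):
        return len(self.idx2word)

To run the function standalone, args must carry the attributes referenced in the body (recipe1m_path, forcegen, threshold_ingrs, and the min/max length filters). The flag names below follow those attributes; the default values are illustrative, not the repository's.

import argparse

from src.utils.recipe1m_utils import build_vocab_recipe1m

parser = argparse.ArgumentParser()
parser.add_argument('--recipe1m_path', type=str, default='data/recipe1m')  # illustrative
parser.add_argument('--forcegen', action='store_true')  # recompute cached counters
parser.add_argument('--threshold_ingrs', type=int, default=10)  # illustrative
parser.add_argument('--minnumingrs', type=int, default=2)  # illustrative
parser.add_argument('--maxnumingrs', type=int, default=20)  # illustrative
parser.add_argument('--minnuminstrs', type=int, default=2)  # illustrative
parser.add_argument('--maxnuminstrs', type=int, default=20)  # illustrative
parser.add_argument('--minnumwords', type=int, default=20)  # illustrative
args = parser.parse_args()

vocab_ingrs, dataset = build_vocab_recipe1m(args)
print(len(vocab_ingrs), {split: len(entries) for split, entries in dataset.items()})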