in src/utils/recipe1m_utils.py [0:0]
# Imports this function relies on (normally at the top of the file); the helpers
# used below (Vocabulary, get_ingredient, get_instruction, cluster_ingredients,
# remove_plurals) are assumed to be defined or imported elsewhere in this module.
import os
import json
import pickle
from collections import Counter

from tqdm import tqdm


def build_vocab_recipe1m(args):
    print("Loading data...")
    args.save_path = os.path.join(args.recipe1m_path, 'preprocessed')
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    dets = json.load(open(os.path.join(args.recipe1m_path, 'det_ingrs.json'), 'r'))
    layer1 = json.load(open(os.path.join(args.recipe1m_path, 'layer1.json'), 'r'))
    layer2 = json.load(open(os.path.join(args.recipe1m_path, 'layer2.json'), 'r'))

    # map recipe id -> index into layer2 (image annotations)
    id2im = {}
    for i, entry in enumerate(layer2):
        id2im[entry['id']] = i

    print("Loaded data.")
    print("Found %d recipes in the dataset." % (len(layer1)))

    # substitutions applied while cleaning ingredient names and instruction text
    replace_dict_ingrs = {'and': ['&', "'n"], '': ['%', ',', '.', '#', '[', ']', '!', '?']}
    replace_dict_instrs = {'and': ['&', "'n"], '': ['#', '[', ']']}

    # map recipe id -> index into det_ingrs.json (pre-detected ingredients)
    idx2ind = {}
    for i, entry in enumerate(dets):
        idx2ind[entry['id']] = i
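    # Structure this function expects from the three JSON files (inferred from
    # the accesses below):
    #   layer1 entries:    {'id': ..., 'partition': 'train'|'val'|'test',
    #                       'instructions': [{'text': ...}, ...]}
    #   layer2 entries:    {'id': ..., 'images': [{'id': ...}, ...]}
    #   det_ingrs entries: {'id': ..., 'ingredients': [...], 'valid': [...]}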
    ingrs_file = os.path.join(args.save_path, 'allingrs_count.pkl')

    #####
    # 1. Count words in dataset and clean
    #####
    if os.path.exists(ingrs_file) and not args.forcegen:
        print("loading pre-extracted word counters")
        counter_ingrs = pickle.load(open(ingrs_file, 'rb'))
    else:
        counter_ingrs = Counter()
        for i, entry in tqdm(enumerate(layer1)):
            # get all instructions for this recipe
            instrs = entry['instructions']
            instrs_list = []
            ingrs_list = []

            # retrieve pre-detected ingredients for this entry
            det_ingrs = dets[idx2ind[entry['id']]]['ingredients']
            valid = dets[idx2ind[entry['id']]]['valid']
            det_ingrs_filtered = []
            for j, det_ingr in enumerate(det_ingrs):
                if len(det_ingr) > 0 and valid[j]:
                    det_ingr_undrs = get_ingredient(det_ingr, replace_dict_ingrs)
                    det_ingrs_filtered.append(det_ingr_undrs)
                    ingrs_list.append(det_ingr_undrs)

            # get raw text for instructions of this entry
            acc_len = 0
            for instr in instrs:
                instr = instr['text']
                instr = get_instruction(instr, replace_dict_instrs)
                if len(instr) > 0:
                    instrs_list.append(instr)
                    acc_len += len(instr)

            # discard recipes with too few or too many ingredients or instruction words
            if len(ingrs_list) < args.minnumingrs or len(instrs_list) < args.minnuminstrs \
                    or len(instrs_list) >= args.maxnuminstrs or len(ingrs_list) >= args.maxnumingrs \
                    or acc_len < args.minnumwords:
                continue

            # only the training split contributes to the ingredient counts
            if entry['partition'] == 'train':
                counter_ingrs.update(ingrs_list)

        pickle.dump(counter_ingrs, open(ingrs_file, 'wb'))
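    # Whichever branch ran, counter_ingrs now maps cleaned, underscore-joined
    # ingredient names (e.g. 'tomato_sauce') to their frequency in the training split.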
    # manually add missing entries for better clustering
    base_words = [
        'peppers', 'tomato', 'spinach_leaves', 'turkey_breast', 'lettuce_leaf', 'chicken_thighs',
        'milk_powder', 'bread_crumbs', 'onion_flakes', 'red_pepper', 'pepper_flakes',
        'juice_concentrate', 'cracker_crumbs', 'hot_chili', 'seasoning_mix', 'dill_weed',
        'pepper_sauce', 'sprouts', 'cooking_spray', 'cheese_blend', 'basil_leaves',
        'pineapple_chunks', 'marshmallow', 'chile_powder', 'cheese_blend', 'corn_kernels',
        'tomato_sauce', 'chickens', 'cracker_crust', 'lemonade_concentrate', 'red_chili',
        'mushroom_caps', 'mushroom_cap', 'breaded_chicken', 'frozen_pineapple', 'pineapple_chunks',
        'seasoning_mix', 'seaweed', 'onion_flakes', 'bouillon_granules', 'lettuce_leaf',
        'stuffing_mix', 'parsley_flakes', 'chicken_breast', 'basil_leaves', 'baguettes',
        'green_tea', 'peanut_butter', 'green_onion', 'fresh_cilantro', 'breaded_chicken',
        'hot_pepper', 'dried_lavender', 'white_chocolate', 'dill_weed', 'cake_mix', 'cheese_spread',
        'turkey_breast', 'chucken_thighs', 'basil_leaves', 'mandarin_orange', 'laurel',
        'cabbage_head', 'pistachio', 'cheese_dip', 'thyme_leave', 'boneless_pork', 'red_pepper',
        'onion_dip', 'skinless_chicken', 'dark_chocolate', 'canned_corn', 'muffin', 'cracker_crust',
        'bread_crumbs', 'frozen_broccoli', 'philadelphia', 'cracker_crust', 'chicken_breast'
    ]
    for base_word in base_words:
        if base_word not in counter_ingrs.keys():
            counter_ingrs[base_word] = 1
    counter_ingrs, cluster_ingrs = cluster_ingredients(counter_ingrs)
    counter_ingrs, cluster_ingrs = remove_plurals(counter_ingrs, cluster_ingrs)

    # If an ingredient's frequency is below 'threshold_ingrs', it is discarded.
    ingrs = {word: cnt for word, cnt in counter_ingrs.items() if cnt >= args.threshold_ingrs}

    # Ingredient vocab: create a vocab wrapper for ingredients
    vocab_ingrs = Vocabulary()
    idx = vocab_ingrs.add_word('<end>')
    # this returns the next idx to add words to
    # Add the ingredients to the vocabulary: every word in a cluster is added at
    # the same index, so idx only advances once per cluster.
    for k, _ in ingrs.items():
        for ingr in cluster_ingrs[k]:
            idx = vocab_ingrs.add_word(ingr, idx)
        idx += 1
    _ = vocab_ingrs.add_word('<pad>', idx)
    print("Total ingr vocabulary size: {}".format(len(vocab_ingrs)))
    dataset = {'train': [], 'val': [], 'test': []}

    ######
    # 2. Tokenize and build dataset based on vocabularies.
    ######
    for i, entry in tqdm(enumerate(layer1)):
        # get all instructions for this recipe
        instrs = entry['instructions']
        instrs_list = []
        ingrs_list = []
        images_list = []

        # retrieve pre-detected ingredients for this entry
        det_ingrs = dets[idx2ind[entry['id']]]['ingredients']
        valid = dets[idx2ind[entry['id']]]['valid']
        labels = []
        for j, det_ingr in enumerate(det_ingrs):
            if len(det_ingr) > 0 and valid[j]:
                det_ingr_undrs = get_ingredient(det_ingr, replace_dict_ingrs)
                ingrs_list.append(det_ingr_undrs)
                label_idx = vocab_ingrs(det_ingr_undrs)
                if label_idx != vocab_ingrs('<pad>') and label_idx not in labels:
                    labels.append(label_idx)

        # get raw text for instructions of this entry (used only for filtering below)
        acc_len = 0
        for instr in instrs:
            instr = instr['text']
            instr = get_instruction(instr, replace_dict_instrs)
            if len(instr) > 0:
                acc_len += len(instr)
                instrs_list.append(instr)

        # discard recipes with too many or too few ingredients or instruction words
        if len(labels) < args.minnumingrs or len(instrs_list) < args.minnuminstrs \
                or len(instrs_list) >= args.maxnuminstrs or len(labels) >= args.maxnumingrs \
                or acc_len < args.minnumwords:
            continue

        # copy image ids for this recipe, if layer2 has images for it
        if entry['id'] in id2im.keys():
            ims = layer2[id2im[entry['id']]]
            for im in ims['images']:
                images_list.append(im['id'])

        newentry = {
            'id': entry['id'],
            'ingredients': ingrs_list,
            'images': images_list,
        }
        dataset[entry['partition']].append(newentry)

    print('Dataset size:')
    for split in dataset.keys():
        print(split, ':', len(dataset[split]))

    return vocab_ingrs, dataset
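
# Minimal usage sketch (illustrative, not the repo's actual entry point): the
# argparse flags mirror the attributes build_vocab_recipe1m reads from `args`;
# the default values and the output pickle file names below are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--recipe1m_path', type=str, default='data/recipe1m')
    parser.add_argument('--forcegen', action='store_true')
    parser.add_argument('--threshold_ingrs', type=int, default=10)
    parser.add_argument('--minnumingrs', type=int, default=2)
    parser.add_argument('--maxnumingrs', type=int, default=20)
    parser.add_argument('--minnuminstrs', type=int, default=2)
    parser.add_argument('--maxnuminstrs', type=int, default=20)
    parser.add_argument('--minnumwords', type=int, default=20)
    args = parser.parse_args()

    vocab_ingrs, dataset = build_vocab_recipe1m(args)

    # persist the vocabulary and the per-split datasets (file names are illustrative)
    pickle.dump(vocab_ingrs, open(os.path.join(args.save_path, 'recipe1m_vocab_ingrs.pkl'), 'wb'))
    for split, entries in dataset.items():
        pickle.dump(entries, open(os.path.join(args.save_path, 'recipe1m_{}.pkl'.format(split)), 'wb'))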