in mm_action_prediction/tools/embed_fashion_assets.py [0:0]
import ast
import json

import numpy as np
import spacy

# Assumed context: EMBED_ATTRIBUTES, the list of asset metadata fields to
# embed, is defined at module level elsewhere in this file.


def main(args):
    # Load the raw asset metadata, keyed by image id.
    with open(args["input_asset_file"], "r") as file_id:
        assets = json.load(file_id)
    # Select and keep only the attributes to be embedded.
    cleaned_assets = []
    for image_id, asset in assets.items():
        clean_asset = {}
        asset_info = asset["metadata"]
        for key in EMBED_ATTRIBUTES:
            if key in asset_info:
                val = asset_info[key]
                # val = correction.get(val, val).lower()
                # Some values are string-encoded lists, e.g. "['a', 'b']".
                val = ast.literal_eval(val) if "[" in val else val
                # Normalize every attribute value to a list of strings.
                clean_asset[key] = val if isinstance(val, list) else [val]
        clean_asset["id"] = int(image_id)
        cleaned_assets.append(clean_asset)
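    # Illustration only (hypothetical field names and values): a cleaned asset
    # looks like {"type": ["jacket"], "color": ["black", "grey"], "id": 123}.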
    # Build a value-frequency vocabulary for each embedded field.
    vocabulary = {key: {} for key in EMBED_ATTRIBUTES}
    for asset in cleaned_assets:
        for attr in EMBED_ATTRIBUTES:
            attr_val = asset.get(attr, [])
            for val in attr_val:
                vocabulary[attr][val] = vocabulary[attr].get(val, 0) + 1
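    # Illustration only (hypothetical values): vocabulary ends up shaped like
    # {"color": {"black": 12, "grey": 7}, ...}, mapping each value to its raw
    # occurrence count; it is not used again within this function.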
    # Embedding for each item: embed each value with spaCy, average over an
    # attribute's values, then concatenate across attributes.
    nlp = spacy.load(args["spacy_model"])
    sample_feature = nlp("apple").vector
    feature_size = sample_feature.size
    zero_features = np.zeros(feature_size)
    embeddings = []
    id_list = []
    for asset in cleaned_assets:
        embed_vector = []
        for attr in EMBED_ATTRIBUTES:
            if attr in asset and len(asset[attr]) > 0:
                attr_val = asset[attr]
                feature_vector = np.stack(
                    [nlp(val).vector for val in attr_val]
                ).mean(0)
            else:
                # Missing attribute -> zero vector of the same size.
                feature_vector = zero_features
            embed_vector.append(feature_vector)
        embeddings.append(np.concatenate(embed_vector))
        id_list.append(asset["id"])
    embeddings = np.stack(embeddings)
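    # embeddings has shape (num_assets, len(EMBED_ATTRIBUTES) * feature_size):
    # one feature_size-dim vector per attribute, concatenated per asset.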
    print("Saving embeddings: {}".format(args["embed_path"]))
    # np.save pickles the dict into a 0-d object array; read it back with
    # np.load(path, allow_pickle=True).item().
    np.save(
        args["embed_path"],
        {
            "asset_id": id_list,
            "embedding": embeddings,
            "asset_feature_size": embeddings.shape[1],
        },
    )
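

# Illustrative sketch only, not part of the excerpt above: main() indexes
# `args` like a dict with keys "input_asset_file", "spacy_model", and
# "embed_path", so an entry point consistent with that usage (the original
# script's parser may differ) could look like this.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Embed fashion asset metadata")
    parser.add_argument("--input_asset_file", required=True)
    parser.add_argument("--spacy_model", required=True)
    parser.add_argument("--embed_path", required=True)
    main(vars(parser.parse_args()))

# Reading the result back (np.save pickled a dict):
#   saved = np.load(path_to_saved_npy, allow_pickle=True).item()
#   ids, matrix = saved["asset_id"], saved["embedding"]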