# cli/foundation-models/system/finetune/video-multi-object-tracking/mot2coco.py
def main(args):
    """Convert a MOTChallenge-style dataset into COCO-format JSON annotations.

    Expects ``args.input`` to contain ``train``/``test`` subfolders, each with
    one folder per video holding a ``seqinfo.ini``, an image folder, and
    (for train) ``gt/gt.txt``.  Writes one ``{subset}_cocoformat.json`` per
    subset into ``args.output``; with ``args.split_train`` it additionally
    emits ``half-train``/``half-val`` splits (first/second half of each train
    video) and writes the matching ``gt_half-*.txt`` next to the original gt.
    With ``args.convert_det`` it also dumps the public detections to
    ``{subset}_detections.pkl``.

    NOTE(review): relies on project helpers (``list_from_file``,
    ``parse_gts``, ``parse_dets``, ``dump_to_json``, ``dump_to_pickle``)
    whose exact return formats are not visible here; comments on their
    outputs below are inferred from usage and should be confirmed.
    """
    if not osp.isdir(args.output):
        os.makedirs(args.output)
    sets = ["train", "test"]
    if args.split_train:
        sets += ["half-train", "half-val"]
    # COCO ids are 1-based and global across all subsets (never reset below).
    vid_id, img_id, ann_id = 1, 1, 1
    for subset in sets:
        # instance_id counter restarts per subset; ins_maps (per video) keeps
        # the MOT track id -> global instance id mapping.
        ins_id = 0
        print(f"Converting {subset} set to COCO format")
        if "half" in subset:
            # half-train / half-val are both carved out of the train folder.
            in_folder = osp.join(args.input, "train")
        else:
            in_folder = osp.join(args.input, subset)
        out_file = osp.join(args.output, f"{subset}_cocoformat.json")
        outputs = defaultdict(list)
        # Single-class dataset: only pedestrians are annotated.
        outputs["categories"] = [dict(id=1, name="pedestrian")]
        if args.convert_det:
            det_file = osp.join(args.output, f"{subset}_detections.pkl")
            detections = dict(det_bboxes=dict())
        video_names = os.listdir(in_folder)
        for video_name in tqdm(video_names):
            # basic params
            # Test videos ship without ground truth.
            parse_gt = "test" not in subset
            ins_maps = dict()
            # load video infos
            video_folder = osp.join(in_folder, video_name)
            # seqinfo.ini lines are "key=value"; indices below follow the
            # fixed MOTChallenge layout (name, imDir, frameRate, seqLength,
            # imWidth, imHeight).
            infos = list_from_file(f"{video_folder}/seqinfo.ini")
            # video-level infos
            assert video_name == infos[1].strip().split("=")[1]
            img_folder = infos[2].strip().split("=")[1]
            img_names = [
                fname
                for fname in os.listdir(f"{video_folder}/{img_folder}")
                if fname.endswith(".jpg") or fname.endswith(".png")
            ]
            # Sort so frame order matches the numeric frame names.
            img_names = sorted(img_names)
            fps = int(infos[3].strip().split("=")[1])
            num_imgs = int(infos[4].strip().split("=")[1])
            assert num_imgs == len(img_names)
            width = int(infos[5].strip().split("=")[1])
            height = int(infos[6].strip().split("=")[1])
            video = dict(
                id=vid_id, name=video_name, fps=fps, width=width, height=height
            )
            # parse annotations
            if parse_gt:
                gts = list_from_file(f"{video_folder}/gt/gt.txt")
                # MOT15 gt rows lack the visibility/class columns, so the
                # parser needs to know which layout it is reading
                # (presumably — confirm against parse_gts).
                if "MOT15" in video_folder:
                    img2gts = parse_gts(gts, True)
                else:
                    img2gts = parse_gts(gts, False)
            if args.convert_det:
                dets = list_from_file(f"{video_folder}/det/det.txt")
                img2dets = parse_dets(dets)
            # make half sets
            if "half" in subset:
                # First half (plus middle frame) goes to half-train, the
                # remainder to half-val.
                split_frame = num_imgs // 2 + 1
                if "train" in subset:
                    img_names = img_names[:split_frame]
                elif "val" in subset:
                    img_names = img_names[split_frame:]
                else:
                    raise ValueError("subset must be named with `train` or `val`")
                # MOT frame ids come from the numeric file names (1-based).
                mot_frame_ids = [str(int(_.split(".")[0])) for _ in img_names]
                # Write a gt file restricted to this half, next to gt.txt.
                # gt rows are comma-separated with the frame id first.
                with open(f"{video_folder}/gt/gt_{subset}.txt", "wt") as f:
                    for gt in gts:
                        if gt.split(",")[0] in mot_frame_ids:
                            f.writelines(f"{gt}\n")
            # image and box level infos
            for frame_id, name in enumerate(img_names):
                img_name = osp.join(video_name, img_folder, name)
                mot_frame_id = int(name.split(".")[0])
                image = dict(
                    id=img_id,
                    video_id=vid_id,
                    file_name=img_name,
                    height=height,
                    width=width,
                    # frame_id is 0-based within the (possibly halved) clip;
                    # mot_frame_id keeps the original 1-based MOT numbering.
                    frame_id=frame_id,
                    mot_frame_id=mot_frame_id,
                )
                if parse_gt:
                    # NOTE: rebinds `gts` (previously the raw gt.txt lines)
                    # to this frame's parsed annotation dicts — safe only
                    # because the gt-file writing above already happened.
                    gts = img2gts[mot_frame_id]
                    for gt in gts:
                        gt.update(id=ann_id, image_id=img_id)
                        # Map the per-video MOT track id to a subset-global
                        # instance id, allocating a new one on first sight.
                        mot_ins_id = gt["mot_instance_id"]
                        if mot_ins_id in ins_maps:
                            gt["instance_id"] = ins_maps[mot_ins_id]
                        else:
                            gt["instance_id"] = ins_id
                            ins_maps[mot_ins_id] = ins_id
                            ins_id += 1
                        outputs["annotations"].append(gt)
                        ann_id += 1
                if args.convert_det:
                    dets = np.array(img2dets[mot_frame_id])
                    if dets.ndim == 1:
                        # A 1-D array here can only mean "no detections";
                        # normalize to an empty (0, 5) box array
                        # (presumably x1,y1,x2,y2,score — confirm).
                        assert len(dets) == 0
                        dets = np.zeros((0, 5))
                    detections["det_bboxes"][img_name] = [dets]
                outputs["images"].append(image)
                img_id += 1
            outputs["videos"].append(video)
            vid_id += 1
        # ins_id has counted every distinct instance in this subset.
        outputs["num_instances"] = ins_id
        print(f"{subset} has {ins_id} instances.")
        dump_to_json(outputs, out_file)
        if args.convert_det:
            dump_to_pickle(detections, det_file)
            print(f"Done! Saved as {out_file} and {det_file}")
        else:
            print(f"Done! Saved as {out_file}")