# def main()
#
# in cli/foundation-models/system/finetune/video-multi-object-tracking/mot2coco.py [0:0]


def _seqinfo_value(line):
    """Return the value of a ``key=value`` line (text between the 1st and 2nd ``=``)."""
    return line.strip().split("=")[1]


def main(args):
    """Convert MOT-style video folders into COCO-format annotation files.

    For each subset (``train`` and ``test``, plus ``half-train`` and
    ``half-val`` when ``args.split_train`` is set) every video folder found in
    the subset directory is read and one ``{subset}_cocoformat.json`` file is
    written to ``args.output``. Video, image and annotation ids start at 1 and
    keep increasing across subsets; instance ids restart at 0 for each subset.

    Args:
        args: Namespace-like object providing
            ``input`` (root folder holding the ``train``/``test`` folders),
            ``output`` (folder receiving the generated files; created when
            missing),
            ``split_train`` (truthy to additionally emit the two half subsets,
            both built from the ``train`` folder), and
            ``convert_det`` (truthy to also convert ``det/det.txt`` of every
            video into ``{subset}_detections.pkl``).

    Side effects:
        Besides the files written to ``args.output``, each half subset writes
        a filtered ground-truth file ``gt/gt_{subset}.txt`` inside every video
        folder of the input ``train`` directory.

    Raises:
        AssertionError: If the name stored in ``seqinfo.ini`` differs from the
            video folder name, or the declared number of images differs from
            the number of ``.jpg``/``.png`` files found.
        ValueError: If a half subset name contains neither ``train`` nor
            ``val``.
    """
    # exist_ok avoids the check-then-create race of an explicit isdir() test.
    os.makedirs(args.output, exist_ok=True)

    sets = ["train", "test"]
    if args.split_train:
        sets += ["half-train", "half-val"]
    vid_id, img_id, ann_id = 1, 1, 1

    for subset in sets:
        ins_id = 0
        is_half = "half" in subset
        # Ground truth is not parsed for subsets whose name contains "test".
        parse_gt = "test" not in subset
        print(f"Converting {subset} set to COCO format")
        # Both half subsets are carved out of the full train folder.
        in_folder = osp.join(args.input, "train" if is_half else subset)
        out_file = osp.join(args.output, f"{subset}_cocoformat.json")
        outputs = defaultdict(list)
        outputs["categories"] = [dict(id=1, name="pedestrian")]
        if args.convert_det:
            det_file = osp.join(args.output, f"{subset}_detections.pkl")
            detections = dict(det_bboxes=dict())
        # NOTE: os.listdir() order is arbitrary, so the ids assigned below
        # follow whatever order the filesystem reports.
        video_names = os.listdir(in_folder)
        for video_name in tqdm(video_names):
            # Maps the MOT instance id of this video to the subset-wide id.
            ins_maps = dict()

            # video-level infos, read from fixed line positions of seqinfo.ini
            video_folder = osp.join(in_folder, video_name)
            infos = list_from_file(f"{video_folder}/seqinfo.ini")
            assert video_name == _seqinfo_value(infos[1]), (
                f"seqinfo.ini name does not match video folder {video_name}"
            )
            img_folder = _seqinfo_value(infos[2])
            img_names = sorted(
                fname
                for fname in os.listdir(f"{video_folder}/{img_folder}")
                if fname.endswith((".jpg", ".png"))
            )
            fps = int(_seqinfo_value(infos[3]))
            num_imgs = int(_seqinfo_value(infos[4]))
            assert num_imgs == len(img_names), (
                f"{video_name}: seqinfo.ini declares {num_imgs} images, "
                f"found {len(img_names)}"
            )
            width = int(_seqinfo_value(infos[5]))
            height = int(_seqinfo_value(infos[6]))
            video = dict(
                id=vid_id, name=video_name, fps=fps, width=width, height=height
            )

            # parse annotations
            if parse_gt:
                gts = list_from_file(f"{video_folder}/gt/gt.txt")
                # The flag tells parse_gts whether this is a MOT15 sequence.
                img2gts = parse_gts(gts, "MOT15" in video_folder)
            if args.convert_det:
                dets = list_from_file(f"{video_folder}/det/det.txt")
                img2dets = parse_dets(dets)

            # make half sets
            if is_half:
                split_frame = num_imgs // 2 + 1
                if "train" in subset:
                    img_names = img_names[:split_frame]
                elif "val" in subset:
                    img_names = img_names[split_frame:]
                else:
                    raise ValueError("subset must be named with `train` or `val`")
                # A set keeps the per-line membership test O(1); the frame id
                # is the first comma-separated field of every gt line.
                half_frame_ids = {
                    str(int(name.split(".")[0])) for name in img_names
                }
                with open(f"{video_folder}/gt/gt_{subset}.txt", "wt") as f:
                    f.writelines(
                        f"{gt}\n"
                        for gt in gts
                        if gt.split(",")[0] in half_frame_ids
                    )

            # image and box level infos
            for frame_id, name in enumerate(img_names):
                img_name = osp.join(video_name, img_folder, name)
                # The numeric file stem is the frame id used by the MOT files.
                mot_frame_id = int(name.split(".")[0])
                image = dict(
                    id=img_id,
                    video_id=vid_id,
                    file_name=img_name,
                    height=height,
                    width=width,
                    frame_id=frame_id,
                    mot_frame_id=mot_frame_id,
                )
                if parse_gt:
                    frame_gts = img2gts[mot_frame_id]
                    for gt in frame_gts:
                        gt.update(id=ann_id, image_id=img_id)
                        mot_ins_id = gt["mot_instance_id"]
                        # First sighting of an instance allocates a new id.
                        if mot_ins_id not in ins_maps:
                            ins_maps[mot_ins_id] = ins_id
                            ins_id += 1
                        gt["instance_id"] = ins_maps[mot_ins_id]
                        outputs["annotations"].append(gt)
                        ann_id += 1
                if args.convert_det:
                    frame_dets = np.array(img2dets[mot_frame_id])
                    if frame_dets.ndim == 1:
                        # A 1-D array is only expected for a frame without
                        # detections; store it as an empty (0, 5) array.
                        assert len(frame_dets) == 0
                        frame_dets = np.zeros((0, 5))
                    detections["det_bboxes"][img_name] = [frame_dets]
                outputs["images"].append(image)
                img_id += 1
            outputs["videos"].append(video)
            vid_id += 1
            # Updated per video so the key holds the running subset total.
            outputs["num_instances"] = ins_id
        print(f"{subset} has {ins_id} instances.")
        dump_to_json(outputs, out_file)
        if args.convert_det:
            dump_to_pickle(detections, det_file)
            print(f"Done! Saved as {out_file} and {det_file}")
        else:
            print(f"Done! Saved as {out_file}")