scripts/data_prepare.py:

# Copyright (c) Alibaba, Inc. and its affiliates.
import argparse
import os
import os.path as osp
from os.path import join, exists, isdir, basename, abspath
from glob import glob
from tqdm import tqdm
import random
import cv2
import numpy as np
import json
import warnings
import math
import torch
import torch.nn.functional as F
import sys
sys.path.append('./')
from utils.general import gaussian2D

try:
    from segment_anything import SamPredictor, sam_model_registry
    sam_checkpoint = "./weights/sam_vit_h_4b8939.pth"
    model_type = "vit_h"
    device = "cuda"
    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device)  # .half() causes precision drops
    dtype = next(sam.named_parameters())[1].dtype
    predictor = SamPredictor(sam)
except Exception:  # missing package or checkpoint
    warnings.warn('It is recommended to install segment-anything for better pseudo masks. '
                  'See instructions in README.md.')
    predictor = None


############ utils ############

def _readlines(path):
    with open(path, 'r') as f:
        lines = f.read().splitlines()
    return lines


def check_break(act):
    # True if the 1-D activation profile has a positive run that is interrupted
    # by zeros and then resumes (positive -> zero -> positive).
    flag = False
    prev_pos, prev_neg = False, False
    for x in act:
        if x:
            if not prev_pos:
                prev_pos = True
            elif prev_neg:
                flag = True
                break
        elif prev_pos:
            prev_neg = True
    return flag


def check_center(crop):
    # True if the mass center of the binary crop deviates from the box center
    # by more than 15% of the box size in either dimension.
    h, w = crop.shape
    indices = torch.nonzero(crop)
    yc, xc = indices.float().mean(dim=0)
    s = 0.15
    return ((yc - h / 2.).abs() > h * s) | ((xc - w / 2.).abs() > w * s)
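
# Illustrative examples (values hypothetical): these two checks flag SAM masks that are
# likely wrong for their box -- check_break catches a foreground profile that breaks off
# and resumes, check_center catches an off-center mass.
#   check_break([0, 3, 5, 0, 2]) -> True   (positives resume after a gap)
#   check_break([0, 3, 5, 2, 0]) -> False  (one contiguous positive run)
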
def segment_image(image, labels, width, height):
    if len(labels) == 0:
        return torch.zeros(image.shape[:2], dtype=torch.float16).to(device), np.full((0,), False)
    if max(width, height) > 1024:
        # image exceeds SAM's input resolution: tile it into <=1024px grids,
        # clip each box to its tile, and segment the tiles recursively
        mask = torch.zeros((height, width), dtype=torch.float16).to(device)
        invalid = np.full((len(labels),), False)
        # overlap = 20  # pixel
        nx, ny = math.ceil(width / 1024), math.ceil(height / 1024)
        width_, height_ = width // nx, height // ny
        xc, yc, w, h = labels[:, -4:].T
        x1, y1, x2, y2 = xc - w / 2., yc - h / 2., xc + w / 2., yc + h / 2.
        for j in range(ny):
            for i in range(nx):
                grid = np.array([i / nx, j / ny, (i + 1) / nx, (j + 1) / ny], dtype=labels.dtype)
                indices = (grid[0] < x2) & (x1 < grid[2]) & (grid[1] < y2) & (y1 < grid[3])
                if indices.sum() == 0:
                    continue
                x1_, y1_, x2_, y2_ = (x1[indices] - grid[0]).clip(0, 1 / nx), \
                                     (y1[indices] - grid[1]).clip(0, 1 / ny), \
                                     (x2[indices] - grid[0]).clip(0, 1 / nx), \
                                     (y2[indices] - grid[1]).clip(0, 1 / ny)
                xc_, yc_, w_, h_ = (x1_ + x2_) / 2., (y1_ + y2_) / 2., (x2_ - x1_), (y2_ - y1_)
                labels_ = np.stack((labels[indices, 0], xc_, yc_, w_, h_), axis=1)
                x1_, y1_, x2_, y2_ = width_ * i, height_ * j, width_ * (i + 1), height_ * (j + 1)
                mask_k, invalid_k = segment_image(image[y1_:y2_, x1_:x2_], labels_, width_, height_)
                mask[y1_:y2_, x1_:x2_] = mask_k
                invalid[indices] |= invalid_k
        return mask, invalid
    c, xc, yc, w, h = labels.T
    x1, y1, x2, y2 = (xc - w / 2.) * width, (yc - h / 2.) * height, \
                     (xc + w / 2.) * width, (yc + h / 2.) * height
    input_boxes = np.stack((x1, y1, x2, y2), axis=1)
    input_boxes = torch.from_numpy(input_boxes).long().to(device)
    predictor.set_image(image)
    transformed_boxes = predictor.transform.apply_boxes_torch(input_boxes, image.shape[:2]).to(dtype)
    masks, _, _ = predictor.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
        return_logits=True
    )  # (batch_size) x (num_predicted_masks_per_input=1) x H x W
    mask = masks.sigmoid().squeeze(1).max(dim=0)[0].half()
    invalid = np.full((len(masks),), False)
    for i, (x1, y1, x2, y2) in enumerate(input_boxes.cpu()):
        crop = mask[y1:y2, x1:x2] > 0.5
        invalid[i] = check_center(crop) | check_break(crop.sum(dim=1)) | check_break(crop.sum(dim=0))
    return mask, invalid


def gen_mask(label_path, image, cls_ratio=False, thresh=0.5, sam_only=False):
    if cls_ratio:
        cls_ratio = [1.83, 5.35, 13.82, 1.00, 5.80, 11.25, 30.11, 44.63, 24.45, 4.89]  # train set
    stride = 1
    # area_min, area_max = 4 * 4 * stride * stride, 6 * 6 * stride * stride
    area_min, area_max = 4 * 4 * 100, 6 * 6 * 50  # for 1920*1080
    min_size = 1e6
    save_path = label_path.replace('/labels/', '/masks/').replace('.txt', '.npy')
    os.makedirs(osp.dirname(save_path), exist_ok=True)
    height, width, _ = image.shape
    nx, ny = width // stride, height // stride
    labels = np.loadtxt(label_path, delimiter=' ').reshape(-1, 5)
    mask = np.zeros((ny, nx), dtype=np.float16)
    weight = np.ones_like(mask)
    if predictor is not None:
        sam_res, invalid = segment_image(image, labels, width, height)
        if stride != 1:
            sam_res = F.interpolate(sam_res[None, None, ...].float(), size=(ny, nx),
                                    mode='bilinear', align_corners=False)[0, 0]
            # sam_res = F.interpolate(sam_res[None, None, ...].float(), size=(ny, nx), mode='nearest')[0, 0]
        sam_res = (sam_res > 0.5).half().cpu().numpy()  # move off GPU before mixing with numpy
    c, xc, yc, w, h = labels.T
    x1, y1, x2, y2 = ((xc - w / 2.) * nx).astype(np.int32).clip(0), \
                     ((yc - h / 2.) * ny).astype(np.int32).clip(0), \
                     np.ceil((xc + w / 2.) * nx).astype(np.int32).clip(0, nx), \
                     np.ceil((yc + h / 2.) * ny).astype(np.int32).clip(0, ny)
    input_boxes = np.stack((x1, y1, x2, y2), axis=1)
    for i, (x1, y1, x2, y2) in enumerate(input_boxes):
        w, h = x2 - x1, y2 - y1
        gaussian = gaussian2D((h, w), sigma=None, thresh=thresh).astype(mask.dtype)
        if predictor is not None:
            sam_mask = sam_res[y1:y2, x1:x2].copy()
            if sam_only:
                gaussian = sam_mask
            else:
                if invalid[i] == 0 and sam_mask.sum() / (w * h) > 0.25:
                    gaussian *= sam_mask
                    np.maximum(gaussian, sam_mask * thresh, out=gaussian)
        masked_hm = mask[y1:y2, x1:x2]
        np.maximum(masked_hm, gaussian, out=masked_hm)
        area = w * h / (width * height) * (1920 * 1080)
        # min_size = min(min_size, area)
        r_size = max(area_min / area, 1.) ** 2
        # elif area > area_max:
        #     r_size = area / area_max
        if cls_ratio:
            r_cls = cls_ratio[int(c[i])] ** 0.7
        else:
            r_cls = 1.0
        r = max(r_size, r_cls)
        masked_wt = weight[y1:y2, x1:x2]
        curr_wt = np.zeros_like(masked_wt) + math.log(r) + 1.
        curr_wt *= (gaussian > 0).astype(mask.dtype)
        np.maximum(masked_wt, curr_wt, out=masked_wt)
    np.save(save_path, np.stack((mask, weight), axis=-1))
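
# Worked example (hypothetical numbers; the weight map presumably re-weights the heatmap
# loss toward small objects): on a 1920x1080 frame with stride 1, a 20x20 box has
# area = 400 < area_min = 1600, so r_size = (1600 / 400) ** 2 = 16; with cls_ratio
# disabled, r = max(16, 1.0) = 16 and the in-box weight is log(16) + 1 ≈ 3.77.
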
############ scripts ############

def prepare_visdrone():
    name_dict = {'0': 'ignored regions', '1': 'pedestrian', '2': 'people', '3': 'bicycle',
                 '4': 'car', '5': 'van', '6': 'truck', '7': 'tricycle', '8': 'awning-tricycle',
                 '9': 'bus', '10': 'motor', '11': 'others'}
    split_dict = {'test-dev': 'test-dev.txt', 'val': 'val.txt', 'train': 'train.txt'}
    root = opt.dataset
    os.makedirs(join(root, 'split'), exist_ok=True)
    for sub_dir in glob(join(root, 'VisDrone2019-DET-*')):
        os.makedirs(join(sub_dir, 'labels'), exist_ok=True)
        images = sorted(glob(join(sub_dir, 'images', '*.jpg')))
        if 'test-challenge' in sub_dir:
            with open(join(root, 'split', 'test-challenge.txt'), 'w+') as f:
                f.writelines([line + '\n' for line in images])
            continue
        data_paths = []
        for image_path in tqdm(images):
            image = cv2.imread(image_path)
            height, width, _ = image.shape
            label_path = image_path.replace('images', 'annotations').replace('.jpg', '.txt')
            # if "masked" in label_path:  # avoid repeated processing
            #     continue
            assert exists(label_path)
            label_lines = []
            masked = False
            # <bbox_left>,<bbox_top>,<bbox_width>,<bbox_height>,<score>,<object_category>,<truncation>,<occlusion>
            for line in _readlines(label_path):
                if line.endswith(','):
                    line = line[:-1]
                x1, y1, w, h, score, cls, truncation, occlusion = list(map(int, line.split(',')))
                if cls in [0, 11]:
                    # gray out 'ignored regions' and 'others'
                    image[y1:y1 + h, x1:x1 + w, :] = 85
                    masked = True
                elif truncation < 2 and (occlusion < 2 or True):  # occlusion filter disabled
                    xc, yc = x1 + w / 2., y1 + h / 2.
                    label_lines.append(('%d' + ' %.6f' * 4 + '\n') % (cls - 1, xc / width, yc / height, w / width, h / height))
            if masked:
                image_path = image_path.replace('.jpg', '_masked.jpg')
                cv2.imwrite(image_path, image)
            # for consistency, derive the YOLO label path from the (possibly renamed) image path
            label_path = image_path.replace('images', 'labels').replace('.jpg', '.txt')
            with open(label_path, 'w+') as f:
                f.writelines(label_lines)
            gen_mask(label_path, image, cls_ratio=True)
            data_paths.append(image_path + '\n')
        with open(join(root, 'split', split_dict[basename(sub_dir)[17:]]), 'w+') as f:
            f.writelines(data_paths)
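
# Conversion example (illustrative): a VisDrone record "100,50,40,20,1,4,0,0" in a
# 1000x500 image is a 'car' (category 4 -> class 3 after the -1 shift) and becomes the
# normalized YOLO line "3 0.120000 0.120000 0.040000 0.040000", since
# xc = 100 + 40/2 = 120 -> 120/1000 and yc = 50 + 20/2 = 60 -> 60/500.
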
def prepare_uavdt():
    root = opt.dataset
    data_dir = join(root, 'UAV-benchmark-M')
    attr_dir = join(root, 'M_attr')
    label_dir = join(root, 'UAV-benchmark-MOTD_v1.0', 'GT')
    split_dir = join(root, 'split')

    # mask images, it takes minutes
    for i, path in enumerate(glob(join(label_dir, '*ignore.txt'))):
        labels = np.loadtxt(path, usecols=(0, 2, 3, 4, 5, 8), dtype=int, delimiter=',').reshape(-1, 6)
        vid_name = basename(path).split('_')[0]
        for frameID, x1, y1, w, h, _ in tqdm(labels, desc='%02d/50' % (i + 1)):
            masked_path = join(data_dir, vid_name, 'img%06d_masked.jpg' % frameID)
            input_path = masked_path if exists(masked_path) else masked_path.replace('_masked.jpg', '.jpg')
            image = cv2.imread(input_path)
            image[y1:y1 + h, x1:x1 + w] = (127, 127, 127)
            cv2.imwrite(masked_path, image)

    data_split = {}
    for mode in ['train', 'test']:
        data_split[mode] = []
        for path in glob(join(attr_dir, mode, '*.txt')):
            data_split[mode].append(basename(path)[:5])
    k = 10
    sep = len(data_split['train']) // k
    random.shuffle(data_split['train'])
    data_split['train'], data_split['valid'] = data_split['train'][sep:], data_split['train'][:sep]

    os.makedirs(split_dir, exist_ok=True)
    for mode in ['train', 'valid', 'test']:
        with open(join(split_dir, '%s_video.txt' % mode), 'w+') as f:
            f.writelines(vid + '\n' for vid in data_split[mode])

        image_paths = []
        for video_name in tqdm(data_split[mode], desc=mode):
            ignore_path = join(label_dir, '%s_gt_ignore.txt' % video_name)
            label_path = join(label_dir, '%s_gt_whole.txt' % video_name)
            # the warning caused by an empty file doesn't matter
            ignores = np.loadtxt(ignore_path, usecols=(0, 2, 3, 4, 5), dtype=int, delimiter=',').reshape(-1, 5)
            ignore_dict = {}
            for frameID, x1, y1, w, h in ignores:
                xyxy = np.array([[x1, y1, x1 + w, y1 + h]])
                if frameID in ignore_dict:
                    ignore_dict[frameID] = np.concatenate((ignore_dict[frameID], xyxy), axis=0)
                else:
                    ignore_dict[frameID] = xyxy
            labels = np.loadtxt(label_path, usecols=(0, 2, 3, 4, 5, 8), dtype=int, delimiter=',').reshape(-1, 6)
            label_dict = {}
            for frameID, x1, y1, w, h, cls in labels:
                xc, yc = x1 + w / 2., y1 + h / 2.
                if frameID in ignore_dict:
                    ignore_regions = ignore_dict[frameID]
                    if np.logical_and(
                        np.logical_and(ignore_regions[:, 0] < xc, xc < ignore_regions[:, 2]),
                        np.logical_and(ignore_regions[:, 1] < yc, yc < ignore_regions[:, 3])
                    ).sum() > 0:
                        continue
                box = [cls, xc, yc, w, h]
                if frameID in label_dict:
                    label_dict[frameID].append(box)
                else:
                    label_dict[frameID] = [box]
            for frameID, bboxes in label_dict.items():
                image_path = join(data_dir, video_name, 'img%06d_masked.jpg' % frameID)
                if not exists(image_path):
                    image_path = image_path.replace('_masked.jpg', '.jpg')
                image = cv2.imread(image_path)
                height, width, _ = image.shape
                label_path = image_path.replace('.jpg', '.txt')
                with open(label_path, 'w+') as f:
                    for cls, xc, yc, w, h in bboxes:
                        assert 1 <= cls <= 3
                        # cls - 1  # merge all categories into one, 'car'
                        f.write('%d %.6f %.6f %.6f %.6f\n' % (0, xc / width, yc / height, w / width, h / height))
                gen_mask(label_path, image)
                image_paths.append(image_path + '\n')

        with open(join(split_dir, '%s.txt' % mode), 'w+') as f:
            f.writelines(image_paths)
        if mode == 'train':
            with open(join(split_dir, '%s_ds.txt' % mode), 'w+') as f:
                f.writelines(image_paths[::10])
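
# Filtering example (illustrative): given an ignore region with corners (0, 0) and
# (100, 100), a box centered at (50, 60) is dropped because its center falls strictly
# inside the region, while a box centered at (150, 60) is kept.
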
def prepare_tinyperson():
    root = opt.dataset
    label_file_dict = {'train': join(root, 'mini_annotations', 'tiny_set_train_all_erase.json'),
                       'test': join(root, 'mini_annotations', 'tiny_set_test_all.json')}
    image_dir = join(root, 'erase_with_uncertain_dataset')
    split_dir = join(root, 'split')
    os.makedirs(split_dir, exist_ok=True)
    for mode in ['train', 'test']:
        with open(label_file_dict[mode], 'r') as f:
            anno = json.load(f)

        image_dict = {}
        for item in anno['images']:
            file_name, width, height = item['file_name'], item['width'], item['height']
            file_path = join(image_dir, mode, file_name)
            image_dict[item['id']] = {'shape': [width, height], 'bboxes': [], 'image_path': file_path}

        for item in anno['annotations']:
            if item['ignore'] or item['uncertain']:
                continue
            _id, (x1, y1, w, h) = item['image_id'], item['bbox']
            width, height = image_dict[_id]['shape']
            xc, yc, w, h = (x1 + w / 2.) / width, (y1 + h / 2.) / height, w / width, h / height
            image_dict[_id]['bboxes'].append('0 %.6f %.6f %.6f %.6f\n' % (xc, yc, w, h))

        paths = []
        for item in image_dict.values():
            image_path = item['image_path']
            label_path = item['image_path'][:-4] + '.txt'
            with open(label_path, 'w+') as f:
                f.writelines(item['bboxes'])
            image = cv2.imread(image_path)
            gen_mask(label_path, image)
            paths.append(image_path + '\n')

        if mode == 'train':
            with open(join(split_dir, 'trainval.txt'), 'w+') as f:
                f.writelines(paths)
            k = 10
            random.shuffle(paths)
            sep = len(paths) // k
            with open(join(split_dir, 'train.txt'), 'w+') as f:
                f.writelines(paths[sep:])
            with open(join(split_dir, 'valid.txt'), 'w+') as f:
                f.writelines(paths[:sep])
        else:
            with open(join(split_dir, 'test.txt'), 'w+') as f:
                f.writelines(paths)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='VisDrone',
                        help='dataset root, e.g., VisDrone, UAVDT, or TinyPerson')
    opt = parser.parse_args()
    assert exists(opt.dataset)

    dataset = opt.dataset.lower()
    if 'visdrone' in dataset:
        prepare_visdrone()
    elif 'uavdt' in dataset:
        prepare_uavdt()
    elif 'tinyperson' in dataset:
        prepare_tinyperson()
    else:
        print('%s is coming soon.' % opt.dataset)
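
# Usage (a sketch, assuming the dataset roots are laid out as described in README.md):
#   python scripts/data_prepare.py --dataset /path/to/VisDrone
#   python scripts/data_prepare.py --dataset /path/to/UAVDT
#   python scripts/data_prepare.py --dataset /path/to/TinyPerson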