def df_to_coco

def df_to_coco_annos()

in libs/solaris/data/coco.py [0:0]
99 lines of code
15 McCabe index (conditional complexity)

def df_to_coco_annos(df, output_path=None, geom_col='geometry',
                     image_id_col=None, category_col=None, score_col=None,
                     preset_categories=None, supercategory_col=None,
                     include_other=True, starting_id=1, verbose=0):
    """Extract COCO-formatted annotations from a pandas ``DataFrame``.

    This function assumes that *annotations are already in pixel coordinates.*
    If this is not the case, you can transform them using
    :func:`solaris.vector.polygon.geojson_to_px_gdf`.

    Note that this function generates annotations formatted per the COCO object
    detection specification. For additional information, see
    `the COCO dataset specification`_.

    .. _the COCO dataset specification: http://cocodataset.org/#format-data

    Arguments
    ---------
    df : :class:`pandas.DataFrame`
        A :class:`pandas.DataFrame` containing geometries to store as annos.
    image_id_col : str, optional
        The column containing image IDs. If not provided, it's assumed that
        all are in the same image, which will be assigned the ID of ``1``.
    geom_col : str, optional
        The name of the column in `df` that contains geometries. The geometries
        should either be shapely :class:`shapely.geometry.Polygon` s or WKT
        strings. Defaults to ``"geometry"``.
    category_col : str, optional
        The name of the column that specifies categories for each object. If
        not provided, all objects will be placed in a single category named
        ``"other"``.
    score_col : str, optional
        The name of the column that specifies the ouptut confidence of a model.
        If not provided, will not be output.
    preset_categories : :class:`list` of :class:`dict`s, optional
        A pre-set list of categories to use for labels. These categories should
        be formatted per
        `the COCO category specification`_.
    starting_id : int, optional
        The number to start numbering annotation IDs at. Defaults to ``1``.
    verbose : int, optional
        Verbose text output. By default, none is provided; if ``True`` or
        ``1``, information-level outputs are provided; if ``2``, extremely
        verbose text is output.

    .. _the COCO category specification: http://cocodataset.org/#format-data

    Returns
    -------
    output_dict : dict
        A dictionary containing COCO-formatted annotation and category entries
        per the `COCO dataset specification`_
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(_get_logging_level(int(verbose)))
    logger.debug('Checking that df is loaded.')
    df = _check_df_load(df)
    temp_df = df.copy()  # for manipulation
    if preset_categories is not None and category_col is None:
        logger.debug('preset_categories has a value, category_col is None.')
        raise ValueError('category_col must be specified if using'
                         ' preset_categories.')
    elif preset_categories is not None and category_col is not None:
        logger.debug('Both preset_categories and category_col have values.')
        logger.debug('Getting list of category names.')
        category_dict = _coco_category_name_id_dict_from_list(
            preset_categories)
        category_names = list(category_dict.keys())
        if not include_other:
            logger.info('Filtering out objects not contained in '
                        ' preset_categories')
            temp_df = temp_df.loc[temp_df[category_col].isin(category_names),
                                  :]
        else:
            logger.info('Setting category to "other" for objects outside of '
                        'preset category list.')
            temp_df.loc[~temp_df[category_col].isin(category_names),
                        category_col] = 'other'
            if 'other' not in category_dict.keys():
                logger.debug('Adding "other" to category_dict.')
                other_id = np.array(list(category_dict.values())).max() + 1
                category_dict['other'] = other_id
                preset_categories.append({'id': other_id,
                                          'name': 'other',
                                          'supercategory': 'other'})
    elif preset_categories is None and category_col is not None:
        logger.debug('No preset_categories, have category_col.')
        logger.info(f'Collecting unique category names from {category_col}.')
        category_names = list(temp_df[category_col].unique())
        logger.info('Generating category ID numbers arbitrarily.')
        category_dict = {k: v for k, v in zip(category_names,
                                              range(1, len(category_names)+1))}
    else:
        logger.debug('No category column or preset categories.')
        logger.info('Setting category to "other" for all objects.')
        category_col = 'category_col'
        temp_df[category_col] = 'other'
        category_names = ['other']
        category_dict = {'other': 1}

    if image_id_col is None:
        temp_df['image_id'] = 1
    else:
        temp_df.rename(columns={image_id_col: 'image_id'})
    logger.debug('Checking geometries.')
    temp_df[geom_col] = temp_df[geom_col].apply(_check_geom)
    logger.info('Getting area of geometries.')
    temp_df['area'] = temp_df[geom_col].apply(lambda x: x.area)
    logger.info('Getting geometry bounding boxes.')
    temp_df['bbox'] = temp_df[geom_col].apply(
        lambda x: bbox_corners_to_coco(x.bounds))
    temp_df['category_id'] = temp_df[category_col].map(category_dict)
    temp_df['annotation_id'] = list(range(starting_id,
                                          starting_id + len(temp_df)))
    if score_col is not None:
        temp_df['score'] = df[score_col]

    def _row_to_coco(row, geom_col, category_id_col, image_id_col, score_col):
        "get a single annotation record from a row of temp_df."
        if score_col is None:
            
            return {'id': row['annotation_id'],
                    'image_id': int(row[image_id_col]),
                    'category_id': int(row[category_id_col]),
                    'segmentation': [polygon_to_coco(row[geom_col])],
                    'area': row['area'],
                    'bbox': row['bbox'],
                    'iscrowd': 0}
        else:
            return {'id': row['annotation_id'],
                    'image_id': int(row[image_id_col]),
                    'category_id': int(row[category_id_col]),
                    'segmentation': [polygon_to_coco(row[geom_col])],
                    'score': float(row[score_col]),
                    'area': row['area'],
                    'bbox': row['bbox'],
                    'iscrowd': 0}

    coco_annotations = temp_df.apply(_row_to_coco, axis=1, geom_col=geom_col,
                                     category_id_col='category_id',
                                     image_id_col=image_id_col,
                                     score_col=score_col).tolist()
    coco_categories = coco_categories_dict_from_df(
        temp_df, category_id_col='category_id',
        category_name_col=category_col,
        supercategory_col=supercategory_col)

    output_dict = {'annotations': coco_annotations,
                   'categories': coco_categories}

    if output_path is not None:
        with open(output_path, 'w') as outfile:
            json.dump(output_dict, outfile)

    return output_dict