"""
Modules for computing the similarity between two poses.
"""
import torch as t
from torchvision.ops import box_area, box_iou
from torchvision.transforms.v2 import ConvertBoundingBoxFormat
from torchvision.tv_tensors import BoundingBoxes, BoundingBoxFormat
from dgs.models.similarity.similarity import SimilarityModule
from dgs.utils.config import DEF_VAL
from dgs.utils.constants import OKS_SIGMAS
from dgs.utils.state import State
from dgs.utils.types import Config, NodePath, Validations

oks_validations: Validations = {
    "format": [str, ("in", list(OKS_SIGMAS.keys()))],
    # optional
    "keypoint_dim": ["optional", int, ("within", (1, 3))],
}
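
# For illustration, a params node that passes ``oks_validations`` might look like
# (hypothetical values; 'coco' is assumed to be one of the keys of OKS_SIGMAS):
#   {"format": "coco", "keypoint_dim": 2}
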
iou_validations: Validations = {}


class ObjectKeypointSimilarity(SimilarityModule):
    """Compute the object key-point similarity (OKS) between two batches of poses / States.
    Params
    ------
    format (str):
        The key point format, e.g., 'coco', 'coco-whole', ... has to be in OKS_SIGMAS.keys().
    Optional Params
    ---------------
    keypoint_dim (int, optional):
        The dimensionality of the key points. So whether 2D or 3D is expected.
        Default ``DEF_VAL.similarity.oks.kp_dim``.
    """
    def __init__(self, config: Config, path: NodePath):
        super().__init__(config, path)
        self.validate_params(oks_validations)
        # get sigma
        sigma: t.Tensor = OKS_SIGMAS[self.params["format"]].to(device=self.device, dtype=self.precision)
        # With k = 2 * sigma -> shape [J].
        # k is constant and only k^2 is ever needed, so store k^2 directly as a buffer.
        self.register_buffer("k2", t.square(t.mul(2, sigma)))
        # Create a small value for epsilon to make sure that we do not divide by zero later on.
        self.register_buffer("eps", t.tensor(t.finfo(self.precision).eps, device=self.device, dtype=self.precision))
        # Set up a transform function to convert the bounding boxes if they have the wrong format
        self.transf_bbox_to_xyxy = ConvertBoundingBoxFormat("XYXY")
        self.kp_dim: int = self.params.get("keypoint_dim", DEF_VAL["similarity"]["oks"]["kp_dim"]) 
    def get_data(self, ds: State) -> t.Tensor:
        """Given a :class:`.State`, compute the detected / predicted key points with shape ``[B1 x J x 2|3]``
        and the areas of the respective ground-truth bounding-boxes with shape ``[B1]``.
        """
        return ds.keypoints.float().view(ds.B, -1, self.kp_dim) 
    def get_area(self, ds: State) -> t.Tensor:
        """Given a :class:`.State`, compute the area of the bounding box."""
        bboxes = ds.bbox
        if bboxes.format == BoundingBoxFormat.XYXY:
            area = box_area(bboxes).float()  # (x2-x1) * (y2-y1)
        elif bboxes.format == BoundingBoxFormat.XYWH:
            area = (bboxes[:, -2] * bboxes[:, -1]).float()  # w * h
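            # E.g., a made-up 'XYWH' box [x, y, w, h] = [5, 10, 20, 40] has area 20 * 40 = 800.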
        else:
            bboxes = self.transf_bbox_to_xyxy(bboxes)
            area = box_area(bboxes).float()
        return area 
    def get_target(self, ds: State) -> tuple[t.Tensor, t.Tensor]:
        """Given a :class:`.State` obtain the ground truth key points and the key-point-visibility.
        Both are tensors, the key points are a FloatTensor of shape ``[B2 x J x 2|3]``
        and the visibility is a BoolTensor of shape ``[B2 x J]``.
        """
        kps = ds.keypoints.float().view(ds.B, -1, self.kp_dim)
        vis = ds.cast_joint_weight(dtype=t.bool).squeeze(-1).view(ds.B, -1)
        return kps, vis 
    def forward(self, data: State, target: State) -> t.Tensor:
        r"""Compute the object key-point similarity between a ground truth label and detected key points.
        There has to be one key point of the label for any detection. (Batch sizes have to match)
        Notes:
            Compute the key-point similarity :math:`\mathtt{ks}_i` for every joint between every detection and the
            respective ground truth annotation.
            .. math::
                \mathtt{ks}_i = \exp(-\dfrac{d_i^2}{2s^2k_i^2})
            The key-point similarity :math:`\mathtt{OKS}` is then computed as the weighted sum
            using the key-point visibilities as weights.
            .. math::
                \mathtt{OKS} = \dfrac{\sum_i \mathtt{ks}_i \cdot \delta (v_i > 0)}{\sum_i \delta (v_i > 0)}
            * :math:`d_i` the euclidean distance between the ground truth and detected key point
            * :math:`k_i` the constant for the key point, computed as :math:`k=2\cdot\sigma`
            * :math:`v_i` the visibility of the key point, with
                * 0 = unlabeled
                * 1 = labeled but not visible
                * 2 = labeled but visible
            * :math:`s` the scale of the ground truth object, with :math:`s^2` becoming the object's segmented area
        Args:
            data: A :class:`.State` object containing at least the key points and the bounding box. Shape ``N``.
            target: A :class:`.State` containing at least the target key points. Shape ``T``.
        Returns:
            A (Float)Tensor of shape ``[N x T]`` with values in ``[0..1]``.
            If requested, the softmax is computed along the -1 dimension,
            resulting in probability distributions for each value of the input data.
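
        Example:
            A minimal usage sketch; ``cfg`` is assumed to be a valid :class:`.Config` whose
            node at ``["oks"]`` sets ``format='coco'``, and ``data`` / ``target`` are assumed
            to be valid :class:`.State` objects.

            >>> oks = ObjectKeypointSimilarity(config=cfg, path=["oks"])  # doctest: +SKIP
            >>> oks(data, target).shape  # doctest: +SKIP
            torch.Size([N, T])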
        """
        # get the predicted key points as [N x J x 2|3] and the bounding-box areas as [N]
        pred_kps = self.get_data(ds=data)
        bbox_area = self.get_area(ds=data)
        # get the ground-truth key points as [T x J x 2|3] and the visibility as [T x J]
        gt_kps, gt_vis = self.get_target(ds=target)
        assert pred_kps.size(-1) == gt_kps.size(-1), "Key-points should have the same number of dimensions"
        # Compute d = Euclidean dist, but don't compute the sqrt, because only d^2 is required.
        # A little tensor magic, because if N != T and N != 1 and T != 1, regular subtraction will fail!
        # Therefore, modify the tensors to have shape [N x J x 2 x 1], [(1 x) J x 2 x T].
        # The output has shape [N x J x 2 x T], then square and sum over the number of dimensions (-2).
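        # E.g., with N=2 detections, T=3 targets, and J=17 joints:
        #   [2 x 17 x 2 x 1] - [17 x 2 x 3] -> [2 x 17 x 2 x 3] -> sum(dim=-2) -> [2 x 17 x 3].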
        d2 = t.sum(
            t.sub(pred_kps.unsqueeze(-1), gt_kps.permute(1, 2, 0)).square(),
            dim=-2,
        )  # -> [N x J x T]
        # Object scale: use the bounding-box areas of the detections directly as s^2.
        # Keep the squared value, since s itself is never needed.
        s2 = bbox_area.flatten()  # [N]
        # Keypoint similarity for every key-point pair of ground truth and detected.
        # Use outer product to combine s^2 [N] with k^2 [J] and add epsilon to make sure to have non-zero values.
        # Again, modify the tensor shapes to match for division.
        # Shapes: d2 [N x J x T], new_outer [N x J x 1]
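        # Note: a perfect detection (d2 = 0) yields ks = exp(0) = 1 for every joint.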
        ks = t.exp(-t.div(d2, (2 * t.outer(s2, self.k2) + self.eps).unsqueeze(-1)))  # -> [N x J x T]
        # The count of non-zero visibilities in the ground-truth
        count = t.count_nonzero(gt_vis, dim=-1)  # [T]
        # with ks [N x J x T], sum over all J and divide by the number of visible joints
        return self.softmax(t.div(t.where(gt_vis.T, ks, 0).sum(dim=-2), count).nan_to_num_(nan=0.0, posinf=0.0)) 
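
# --- Illustrative sketch (hypothetical helper, not part of the module's API) ---
# A minimal, plain-torch reference of the OKS math above for a single
# detection / ground-truth pair; `forward` additionally broadcasts over all
# N x T pairs and uses `self.eps` plus `nan_to_num_` to guard against division by zero.
def _oks_single(pred: t.Tensor, gt: t.Tensor, vis: t.Tensor, k2: t.Tensor, area: t.Tensor) -> t.Tensor:
    """Compute the OKS between one detection and one ground truth.

    Shapes: ``pred``, ``gt``: [J x 2]; ``vis``: [J] (bool); ``k2``: [J]; ``area``: scalar.
    """
    d2 = t.sum(t.square(pred - gt), dim=-1)  # squared Euclidean distance per joint -> [J]
    ks = t.exp(-d2 / (2.0 * area * k2 + 1e-8))  # per-joint key-point similarity -> [J]
    return t.sum(ks[vis]) / t.count_nonzero(vis)  # mean over the visible joints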
 
class IntersectionOverUnion(SimilarityModule):
    """Use the bounding-box based intersection-over-union as a similarity metric.
    Params
    ------
    """
    def __init__(self, config: Config, path: NodePath):
        super().__init__(config, path)
        self.validate_params(iou_validations)

        self.bbox_transform = ConvertBoundingBoxFormat("XYXY")
    def get_data(self, ds: State) -> BoundingBoxes:
        """Given a :class:`.State` obtain the ground-truth bounding-boxes as
        :class:`torchvision.tv_tensors.BoundingBoxes` object of size ``[N x 4]``.
        Notes:
            The box_iou function expects that the bounding boxes are in the 'XYXY' format.
        """
        bboxes = ds.bbox
        if bboxes.format != BoundingBoxFormat.XYXY:
            bboxes = self.bbox_transform(bboxes)
        return bboxes 
    def get_target(self, ds: State) -> BoundingBoxes:
        """Given a :class:`.State` obtain the ground-truth bounding-boxes as
        :class:`torchvision.tv_tensors.BoundingBoxes` object of size ``[T x 4]``.
        Notes:
            The function :func:`box_iou` expects that the bounding boxes are in the 'XYXY' format.
        """
        bboxes = ds.bbox
        if bboxes.format != BoundingBoxFormat.XYXY:
            bboxes = self.bbox_transform(bboxes)
        return bboxes 
    def forward(self, data: State, target: State) -> t.Tensor:
        """Given two states containing bounding-boxes, compute the intersection over union between each pair.
        Args:
            data: A :class:`.State` object containing the detected bounding-boxes. Size ``N``
            target: A :class:`.State` object containing the target bounding-boxes. Size ``T``
        Returns:
            A (Float)Tensor of shape ``[N x T]`` with values in ``[0..1]``.
            If requested, the softmax is computed along the -1 dimension,
            resulting in probability distributions for each value of the input data.
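
        Example:
            A minimal, self-contained sketch of the underlying :func:`box_iou` call;
            the boxes and canvas size below are made-up values for illustration.

            >>> import torch
            >>> from torchvision.ops import box_iou
            >>> from torchvision.tv_tensors import BoundingBoxes
            >>> a = BoundingBoxes(torch.tensor([[0.0, 0.0, 10.0, 10.0]]), format="XYXY", canvas_size=(20, 20))
            >>> b = BoundingBoxes(torch.tensor([[5.0, 5.0, 15.0, 15.0]]), format="XYXY", canvas_size=(20, 20))
            >>> box_iou(a, b)  # intersection 25 / union 175
            tensor([[0.1429]])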
        """
        return self.softmax(box_iou(self.get_data(ds=data), self.get_target(ds=target)))