Spaces:

Mountchicken
/

Rex-Omni

Running on Zero

File size: 4,671 Bytes

a5c4c58

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Task definitions and configurations for Rex Omni
"""

from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional


class TaskType(Enum):
    """Supported task types"""

    DETECTION = "detection"
    POINTING = "pointing"
    VISUAL_PROMPTING = "visual_prompting"
    KEYPOINT = "keypoint"
    OCR_BOX = "ocr_box"
    OCR_POLYGON = "ocr_polygon"
    GUI_DETECTION = "gui_grounding"
    GUI_POINTING = "gui_pointing"


@dataclass
class TaskConfig:
    """Configuration for a specific task"""

    name: str
    prompt_template: str
    description: str
    output_format: str
    requires_categories: bool = True
    requires_visual_prompt: bool = False
    requires_keypoint_type: bool = False


# Task configurations
TASK_CONFIGS: Dict[TaskType, TaskConfig] = {
    TaskType.DETECTION: TaskConfig(
        name="Detection",
        prompt_template="Detect {categories}. Output the bounding box coordinates in [x0, y0, x1, y1] format.",
        description="Detect objects and return in bounding box format",
        output_format="boxes",
        requires_categories=True,
    ),
    TaskType.POINTING: TaskConfig(
        name="Pointing",
        prompt_template="Point to {categories}.",
        description="Point to objects and return in point format",
        output_format="points",
        requires_categories=True,
    ),
    TaskType.VISUAL_PROMPTING: TaskConfig(
        name="Visual Prompting",
        prompt_template="Given reference boxes {visual_prompt} indicating one or more objects, find all similar objects in the image and output their bounding boxes.",
        description="Ground visual prompts to image regions",
        output_format="boxes",
        requires_categories=False,
        requires_visual_prompt=True,
    ),
    TaskType.KEYPOINT: TaskConfig(
        name="Keypoint",
        prompt_template="Can you detect each {categories} in the image using a [x0, y0, x1, y1] box format, and then provide the coordinates of its {keypoints} as [x0, y0]? Output the answer in JSON format.",
        description="Detect keypoints for specific object types",
        output_format="keypoints",
        requires_categories=True,
        requires_keypoint_type=True,
    ),
    TaskType.OCR_BOX: TaskConfig(
        name="OCR Box",
        prompt_template="Detect all {categories} and recognize them.",
        description="Detect text in bounding boxes and recognize",
        output_format="boxes_with_text",
        requires_categories=True,
    ),
    TaskType.OCR_POLYGON: TaskConfig(
        name="OCR Polygon",
        prompt_template="Can you detect all {categories} in this image in polygon format like [x0, y0, x1, y1, x2, y2 ...] and then recognize them?",
        description="Detect text in polygons and recognize",
        output_format="polygons_with_text",
        requires_categories=True,
    ),
    TaskType.GUI_DETECTION: TaskConfig(
        name="GUI Detection",
        prompt_template='Detect element "{categories}"" in the image.',
        description="Detect GUI elements and return in bounding box format",
        output_format="boxes",
        requires_categories=True,
    ),
    TaskType.GUI_POINTING: TaskConfig(
        name="GUI Pointing",
        prompt_template='Point to element "{categories}".',
        description="Point to GUI elements and return in point format",
        output_format="points",
        requires_categories=True,
    ),
}


# Keypoint definitions for different object types
KEYPOINT_CONFIGS = {
    "person": [
        "nose",
        "left eye",
        "right eye",
        "left ear",
        "right ear",
        "left shoulder",
        "right shoulder",
        "left elbow",
        "right elbow",
        "left wrist",
        "right wrist",
        "left hip",
        "right hip",
        "left knee",
        "right knee",
        "left ankle",
        "right ankle",
    ],
    "animal": [
        "left eye",
        "right eye",
        "nose",
        "neck",
        "root of tail",
        "left shoulder",
        "left elbow",
        "left front paw",
        "right shoulder",
        "right elbow",
        "right front paw",
        "left hip",
        "left knee",
        "left back paw",
        "right hip",
        "right knee",
        "right back paw",
    ],
}


def get_task_config(task_type: TaskType) -> TaskConfig:
    """Get configuration for a task type"""
    return TASK_CONFIGS[task_type]


def get_keypoint_config(keypoint_type: str) -> Optional[List[str]]:
    """Get keypoint configuration for a specific type"""
    return KEYPOINT_CONFIGS.get(keypoint_type)