Syzygianinfern0 commited on
Commit
47875a1
·
0 Parent(s):

Initial clean commit for HF Spaces deployment with LFS

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
2
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ vendors/
2
+ output/
3
+ uv.lock
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ # VS Code
167
+ .vscode
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM syzygianinfern0/stormbase:latest
2
+
3
+ # Set up a new user named "user" with user ID 1000
4
+ RUN useradd -m -u 1000 user
5
+
6
+ # Switch to the "user" user
7
+ USER user
8
+
9
+ # Set home to the user's home directory
10
+ ENV HOME=/home/user \
11
+ PATH=/home/user/.local/bin:$PATH
12
+
13
+ # Set the working directory to the user's home directory
14
+ WORKDIR $HOME/app
15
+
16
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
17
+ COPY --chown=user . $HOME/app
18
+
19
+ # Expose Gradio port
20
+ EXPOSE 7860
21
+
22
+ # Run your Gradio app
23
+ CMD ["./launch_space.sh"]
Dockerfile.stormbase ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Start from a base image with CUDA and Python
2
+ FROM nvidia/cuda:12.8.1-base-ubuntu22.04
3
+
4
+ # System setup
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Install system packages
8
+ RUN apt-get update && apt-get install -y \
9
+ python3 python3-pip python3-dev python3-venv \
10
+ git wget unzip cmake build-essential \
11
+ libboost-all-dev libginac-dev libglpk-dev \
12
+ m4 libcln-dev libgmp-dev automake libhwloc-dev \
13
+ libgl1 libglib2.0-0 && \
14
+ rm -rf /var/lib/apt/lists/*
15
+
16
+ # Set working directory
17
+ WORKDIR /app
18
+
19
+ # Copy requirements file
20
+ COPY requirements.txt .
21
+
22
+ # Upgrade pip and install dependencies
23
+ RUN pip install --upgrade pip && \
24
+ pip install -r requirements.txt
25
+
26
+ # ====== Precompile carl-storm ======
27
+ WORKDIR /opt
28
+ RUN git clone https://github.com/moves-rwth/carl-storm && \
29
+ cd carl-storm && \
30
+ mkdir build && cd build && \
31
+ cmake .. && make lib_carl
32
+
33
+ # ====== Precompile Storm ======
34
+ WORKDIR /opt
35
+ RUN wget https://github.com/moves-rwth/storm/archive/stable.zip && \
36
+ unzip stable.zip && \
37
+ cd storm-stable && \
38
+ mkdir build && cd build && \
39
+ cmake ../ -DCMAKE_BUILD_TYPE=Release \
40
+ -DSTORM_DEVELOPER=OFF \
41
+ -DSTORM_LOG_DISABLE_DEBUG=ON \
42
+ -DSTORM_PORTABLE=ON \
43
+ -DSTORM_USE_SPOT_SHIPPED=ON && \
44
+ make -j12
45
+
46
+ RUN pip install stormpy
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ # Neuro Symbolic Video Search with Temporal Logic (NSVS-TL)
4
+
5
+ [![arXiv](https://img.shields.io/badge/arXiv-2403.11021-b31b1b.svg)](https://arxiv.org/abs/2403.11021) [![Paper](https://img.shields.io/badge/Paper-pdf-green.svg)](https://link.springer.com/chapter/10.1007/978-3-031-73229-4_13) [![Website](https://img.shields.io/badge/ProjectWebpage-nsvs--tl-orange.svg)](https://utaustin-swarmlab.github.io/nsvs/) [![GitHub](https://img.shields.io/badge/Code-Source--Code-blue.svg)](https://github.com/UTAustin-SwarmLab/Neuro-Symbolic-Video-Search-Temporal-Logic) [![GitHub](https://img.shields.io/badge/Code-Dataset-blue.svg)](https://github.com/UTAustin-SwarmLab/Temporal-Logic-Video-Dataset)
6
+ </div>
7
+
8
+ ## Abstract
9
+
10
+ The unprecedented surge in video data production in recent years necessitates efficient tools to extract meaningful frames from videos for downstream tasks. Long-term temporal reasoning is a key desideratum for frame retrieval systems. While state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are proficient in short-term semantic understanding, they surprisingly fail at long-term reasoning across frames. A key reason for this failure is that they intertwine per-frame perception and temporal reasoning into a single deep network. Hence, decoupling but co-designing the semantic understanding and temporal reasoning is essential for efficient scene identification. We propose a system that leverages vision-language models for semantic understanding of individual frames but effectively reasons about the long-term evolution of events using state machines and temporal logic (TL) formulae that inherently capture memory. Our TL-based reasoning improves the F1 score of complex event identification by 9-15% compared to benchmarks that use GPT-4 for reasoning on state-of-the-art self-driving datasets such as Waymo and NuScenes. The source code is available on Github.
11
+
12
+ ## Installation Guide
13
+ Ensure you have **CUDA 12.4** installed and available on your system.
14
+ On Linux, you can verify with:
15
+ ```bash
16
+ nvcc --version
17
+ ```
18
+
19
+ From the root of the repo, run the following to build all STORM dependencies:
20
+ ```bash
21
+ ./build_dependency
22
+ ```
23
+
24
+ Next, install uv:
25
+ ```bash
26
+ pip install uv
27
+ ```
28
+
29
+ Finally, install everything in `pyproject.toml` to build project dependencies:
30
+ ```bash
31
+ uv sync
32
+ ```
33
+
34
+
35
+ ## Running the System
36
+
37
+ NSVS can be run in two ways: running it with raw mp4 files and input queries or running it via the TLV dataset.
38
+
39
+ To run it with mp4 files, modify the mp4 file paths and the natural language search query inside `execute_with_mp4.py` and run it with:
40
+ ```bash
41
+ uv run execute_with_mp4
42
+ ```
43
+
44
+ To run it with the TLV dataset, first download the dataset from [GitHub](https://github.com/UTAustin-SwarmLab/Temporal-Logic-Video-Dataset). Then, specify the dataset path in `execute_with_tlv.py` and run the program:
45
+ ```bash
46
+ uv run execute_with_tlv
47
+ ```
48
+
49
+
50
+ ## Connect with Me
51
+
52
+ <p align="center">
53
+ <em>Feel free to connect with me through these professional channels:</em>
54
+ <p align="center">
55
+ <a href="https://www.linkedin.com/in/mchoi07/" target="_blank"><img src="https://img.shields.io/badge/-LinkedIn-0077B5?style=flat-square&logo=Linkedin&logoColor=white" alt="LinkedIn"/></a>
56
+ <a href="mailto:[email protected]"><img src="https://img.shields.io/badge/-Email-D14836?style=flat-square&logo=Gmail&logoColor=white" alt="Email"/></a>
57
+ <a href="https://scholar.google.com/citations?user=ai4daB8AAAAJ&hl" target="_blank"><img src="https://img.shields.io/badge/-Google%20Scholar-4285F4?style=flat-square&logo=google-scholar&logoColor=white" alt="Google Scholar"/></a>
58
+ <a href="https://minkyuchoi-07.github.io" target="_blank"><img src="https://img.shields.io/badge/-Website-00C7B7?style=flat-square&logo=Internet-Explorer&logoColor=white" alt="Website"/></a>
59
+ <a href="https://x.com/MinkyuChoi7" target="_blank"><img src="https://img.shields.io/badge/-Twitter-1DA1F2?style=flat-square&logo=Twitter&logoColor=white" alt="X"/></a>
60
+ </p>
61
+
62
+ ## Citation
63
+
64
+ If you find this repo useful, please cite our paper:
65
+
66
+ ```bibtex
67
+ @inproceedings{choi2024towards,
68
+ title={Towards neuro-symbolic video understanding},
69
+ author={Choi, Minkyu and Goel, Harsh and Omama, Mohammad and Yang, Yunhao and Shah, Sahil and Chinchali, Sandeep},
70
+ booktitle={European Conference on Computer Vision},
71
+ pages={220--236},
72
+ year={2024},
73
+ organization={Springer}
74
+ }
75
+ ```
build_dependency.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ REPO_DIR="$(pwd)"
5
+ VENDORS_DIR="$REPO_DIR/vendors"
6
+ INSTALL_PREFIX="$VENDORS_DIR/install"
7
+
8
+ mkdir -p "$VENDORS_DIR"
9
+ cd "$VENDORS_DIR"
10
+
11
+ # carl-storm
12
+ cd "$VENDORS_DIR"
13
+ if [ ! -d "carl-storm" ]; then
14
+ git clone https://github.com/moves-rwth/carl-storm
15
+ fi
16
+ cmake -S carl-storm -B carl-storm/build \
17
+ -DCMAKE_BUILD_TYPE=Release \
18
+ -DCMAKE_INSTALL_PREFIX="$INSTALL_PREFIX"
19
+ cmake --build carl-storm/build -j"$(nproc)" --target lib_carl
20
+ cmake --build carl-storm/build --target install
21
+
22
+ # storm-stable
23
+ if [ ! -d "storm-stable" ]; then
24
+ git clone --branch stable --depth 1 --recursive https://github.com/moves-rwth/storm.git storm-stable
25
+ fi
26
+ cmake -S storm-stable -B storm-stable/build \
27
+ -DCMAKE_BUILD_TYPE=Release \
28
+ -DCMAKE_INSTALL_PREFIX="$INSTALL_PREFIX" \
29
+ -DSTORM_DEVELOPER=OFF \
30
+ -DSTORM_LOG_DISABLE_DEBUG=ON \
31
+ -DSTORM_PORTABLE=ON \
32
+ -DSTORM_USE_SPOT_SHIPPED=ON
33
+ cmake --build storm-stable/build -j"$(nproc)"
34
+ cmake --build storm-stable/build --target install
35
+
36
+ export CMAKE_ARGS="-DCMAKE_POLICY_VERSION_MINIMUM=3.5"
37
+ export STORM_DIR_HINT="$INSTALL_PREFIX"
38
+ export CARL_DIR_HINT="$INSTALL_PREFIX"
39
+ unset CMAKE_ARGS || true
40
+
demo_videos/blue_shirt.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afdac2b4ef3d815ccc8beb676bcccfb49245e52e4e8fe298f9fd32e7b2a1651d
3
+ size 3741328
demo_videos/car.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1cfbd984f956ba5dc70f6d76009d65f5fd9ecc26686233cff3cc4860c880025
3
+ size 7408741
demo_videos/dog_jump.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ddb11ae2c86cef81311b3c753e30447828fce6086025b01ba2893621365739b
3
+ size 5096729
demo_videos/teaser-gen3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af319b324939c6a03cae461ca9da1e28fa550cc54ef9ac3a12d9525a1f309e48
3
+ size 4421653
demo_videos/teaser-pika.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:518ccf98b32c4fbdcc49ca686c4c42d1f8f632a835c72da1dc9104f08d8439f5
3
+ size 1560341
execute_demo.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import uuid
4
+ import cv2
5
+ import subprocess
6
+ import numpy as np
7
+ import gradio as gr
8
+ import tempfile
9
+ from typing import Dict, List, Iterable, Tuple
10
+
11
+ from ns_vfs.video.read_mp4 import Mp4Reader
12
+ from execute_with_mp4 import process_entry
13
+
14
+
15
+ def _load_entry_from_reader(video_path, query_text):
16
+ reader = Mp4Reader(
17
+ [{"path": video_path, "query": query_text}],
18
+ openai_save_path="",
19
+ sampling_rate_fps=0.5
20
+ )
21
+ data = reader.read_video()
22
+ if not data:
23
+ raise RuntimeError("No data returned by Mp4Reader (check video path)")
24
+ return data[0]
25
+
26
+
27
+ def _make_empty_video(path, width=320, height=240, fps=1.0):
28
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
29
+ writer = cv2.VideoWriter(path, fourcc, fps, (width, height))
30
+ frame = np.zeros((height, width, 3), dtype=np.uint8)
31
+ writer.write(frame)
32
+ writer.release()
33
+ return path
34
+
35
+
36
+ def _crop_video_ffmpeg(input_path, output_path, frame_indices, prop_matrix):
37
+ if len(frame_indices) == 0:
38
+ cap = cv2.VideoCapture(str(input_path))
39
+ if not cap.isOpened():
40
+ raise RuntimeError(f"Could not open video: {input_path}")
41
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
42
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
43
+ cap.release()
44
+ _make_empty_video(output_path, width, height, fps=1.0)
45
+ return
46
+
47
+ def group_into_ranges(frames):
48
+ if not frames:
49
+ return []
50
+ frames = sorted(set(frames))
51
+ ranges = []
52
+ start = prev = frames[0]
53
+ for f in frames[1:]:
54
+ if f == prev + 1:
55
+ prev = f
56
+ else:
57
+ ranges.append((start, prev + 1)) # end-exclusive
58
+ start = prev = f
59
+ ranges.append((start, prev + 1))
60
+ return ranges
61
+
62
+ ranges = group_into_ranges(frame_indices)
63
+ filters = []
64
+ labels = []
65
+ for i, (start, end) in enumerate(ranges):
66
+ filters.append(
67
+ f"[0:v]trim=start_frame={start}:end_frame={end},setpts=PTS-STARTPTS[v{i}]"
68
+ )
69
+ labels.append(f"[v{i}]")
70
+ filters.append(f"{''.join(labels)}concat=n={len(ranges)}:v=1:a=0[outv]")
71
+
72
+ cmd = [
73
+ "ffmpeg", "-y", "-i", input_path,
74
+ "-filter_complex", "; ".join(filters),
75
+ "-map", "[outv]",
76
+ "-c:v", "libx264", "-preset", "fast", "-crf", "23",
77
+ output_path,
78
+ ]
79
+ subprocess.run(cmd, check=True)
80
+
81
+
82
+ def _crop_video(input_path: str, output_path: str, frame_indices: List[int], prop_matrix: Dict[str, List[int]]):
83
+ input_path = str(input_path)
84
+ output_path = str(output_path)
85
+
86
+ # Probe width/height/fps
87
+ cap = cv2.VideoCapture(input_path)
88
+ if not cap.isOpened():
89
+ raise RuntimeError(f"Could not open video: {input_path}")
90
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
91
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
92
+ fps = float(cap.get(cv2.CAP_PROP_FPS)) or 0.0
93
+ cap.release()
94
+ if fps <= 0:
95
+ fps = 30.0
96
+
97
+ # If nothing to write, emit a 1-frame empty video
98
+ if not frame_indices:
99
+ from numpy import zeros, uint8
100
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
101
+ out = cv2.VideoWriter(output_path, fourcc, 1.0, (width, height))
102
+ out.write(zeros((height, width, 3), dtype=uint8))
103
+ out.release()
104
+ return
105
+
106
+ # Helper: group consecutive integers into (start, end_exclusive)
107
+ def _group_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
108
+ f = sorted(set(int(x) for x in frames))
109
+ if not f:
110
+ return []
111
+ out = []
112
+ s = p = f[0]
113
+ for x in f[1:]:
114
+ if x == p + 1:
115
+ p = x
116
+ else:
117
+ out.append((s, p + 1))
118
+ s = p = x
119
+ out.append((s, p + 1))
120
+ return out
121
+
122
+ # Invert prop_matrix to {frame_idx: sorted [props]}
123
+ props_by_frame: Dict[int, List[str]] = {}
124
+ for prop, frames in (prop_matrix or {}).items():
125
+ for fi in frames:
126
+ fi = int(fi)
127
+ props_by_frame.setdefault(fi, []).append(prop)
128
+ for fi in list(props_by_frame.keys()):
129
+ props_by_frame[fi] = sorted(set(props_by_frame[fi]))
130
+
131
+ # Only subtitle frames we will output
132
+ fi_set = set(int(x) for x in frame_indices)
133
+ frames_with_labels = sorted(fi for fi in fi_set if props_by_frame.get(fi))
134
+
135
+ # Compress consecutive frames that share the same label set
136
+ grouped_label_spans: List[Tuple[int, int, Tuple[str, ...]]] = []
137
+ prev_f = None
138
+ prev_labels: Tuple[str, ...] = ()
139
+ span_start = None
140
+ for f in frames_with_labels:
141
+ labels = tuple(props_by_frame.get(f, []))
142
+ if prev_f is None:
143
+ span_start, prev_f, prev_labels = f, f, labels
144
+ elif (f == prev_f + 1) and (labels == prev_labels):
145
+ prev_f = f
146
+ else:
147
+ grouped_label_spans.append((span_start, prev_f + 1, prev_labels))
148
+ span_start, prev_f, prev_labels = f, f, labels
149
+ if prev_f is not None and prev_labels:
150
+ grouped_label_spans.append((span_start, prev_f + 1, prev_labels))
151
+
152
+ # Build ASS subtitle file (top-right)
153
+ def ass_time(t_sec: float) -> str:
154
+ cs = int(round(t_sec * 100))
155
+ h = cs // (100 * 3600)
156
+ m = (cs // (100 * 60)) % 60
157
+ s = (cs // 100) % 60
158
+ cs = cs % 100
159
+ return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
160
+
161
+ def make_ass(width: int, height: int) -> str:
162
+ lines = []
163
+ lines.append("[Script Info]")
164
+ lines.append("ScriptType: v4.00+")
165
+ lines.append("ScaledBorderAndShadow: yes")
166
+ lines.append(f"PlayResX: {width}")
167
+ lines.append(f"PlayResY: {height}")
168
+ lines.append("")
169
+ lines.append("[V4+ Styles]")
170
+ lines.append("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
171
+ "Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, "
172
+ "Shadow, Alignment, MarginL, MarginR, MarginV, Encoding")
173
+ # Font size 18 per your request; Alignment=9 (top-right)
174
+ lines.append("Style: Default,DejaVu Sans,18,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,"
175
+ "0,0,0,0,100,100,0,0,1,2,0.8,9,16,16,16,1")
176
+ lines.append("")
177
+ lines.append("[Events]")
178
+ lines.append("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text")
179
+
180
+ for start_f, end_f, labels in grouped_label_spans:
181
+ if not labels:
182
+ continue
183
+ start_t = ass_time(start_f / fps)
184
+ end_t = ass_time(end_f / fps)
185
+ text = r"\N".join(labels) # stacked lines
186
+ lines.append(f"Dialogue: 0,{start_t},{end_t},Default,,0,0,0,,{text}")
187
+
188
+ return "\n".join(lines)
189
+
190
+ tmp_dir = tempfile.mkdtemp(prefix="props_ass_")
191
+ ass_path = os.path.join(tmp_dir, "props.ass")
192
+ with open(ass_path, "w", encoding="utf-8") as f:
193
+ f.write(make_ass(width, height))
194
+
195
+ # Build trim/concat ranges from requested frame_indices
196
+ ranges = _group_ranges(frame_indices)
197
+
198
+ # Filtergraph with burned subtitles then trim/concat
199
+ split_labels = [f"[s{i}]" for i in range(len(ranges))] if ranges else []
200
+ out_labels = [f"[v{i}]" for i in range(len(ranges))] if ranges else []
201
+
202
+ filters = []
203
+ ass_arg = ass_path.replace("\\", "\\\\")
204
+ filters.append(f"[0:v]subtitles='{ass_arg}'[sub]")
205
+
206
+ if len(ranges) == 1:
207
+ s0, e0 = ranges[0]
208
+ filters.append(f"[sub]trim=start_frame={s0}:end_frame={e0},setpts=PTS-STARTPTS[v0]")
209
+ else:
210
+ if ranges:
211
+ filters.append(f"[sub]split={len(ranges)}{''.join(split_labels)}")
212
+ for i, (s, e) in enumerate(ranges):
213
+ filters.append(f"{split_labels[i]}trim=start_frame={s}:end_frame={e},setpts=PTS-STARTPTS{out_labels[i]}")
214
+
215
+ if ranges:
216
+ filters.append(f"{''.join(out_labels)}concat=n={len(ranges)}:v=1:a=0[outv]")
217
+
218
+ filter_complex = "; ".join(filters)
219
+
220
+ cmd = [
221
+ "ffmpeg", "-y",
222
+ "-i", input_path,
223
+ "-filter_complex", filter_complex,
224
+ "-map", "[outv]" if ranges else "[sub]",
225
+ "-c:v", "libx264", "-preset", "fast", "-crf", "23",
226
+ output_path,
227
+ ]
228
+ try:
229
+ subprocess.run(cmd, check=True)
230
+ finally:
231
+ try:
232
+ os.remove(ass_path)
233
+ os.rmdir(tmp_dir)
234
+ except OSError:
235
+ pass
236
+
237
+
238
+ def _format_prop_ranges(prop_matrix: Dict[str, List[int]]) -> str:
239
+ def group_into_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
240
+ f = sorted(set(int(x) for x in frames))
241
+ if not f:
242
+ return []
243
+ ranges: List[Tuple[int, int]] = []
244
+ s = p = f[0]
245
+ for x in f[1:]:
246
+ if x == p + 1:
247
+ p = x
248
+ else:
249
+ ranges.append((s, p)) # inclusive end for display
250
+ s = p = x
251
+ ranges.append((s, p))
252
+ return ranges
253
+
254
+ if not prop_matrix:
255
+ return "No propositions detected."
256
+
257
+ lines = []
258
+ for prop, frames in prop_matrix.items():
259
+ ranges = group_into_ranges(frames)
260
+ pretty = prop.replace("_", " ").title()
261
+ if not ranges:
262
+ lines.append(f"{pretty}: —")
263
+ continue
264
+ parts = [f"{a}" if a == b else f"{a}-{b}" for (a, b) in ranges]
265
+ lines.append(f"{pretty}: {', '.join(parts)}")
266
+ return "\n".join(lines)
267
+
268
+
269
+ # -----------------------------
270
+ # Gradio handler
271
+ # -----------------------------
272
+ def run_pipeline(input_video, mode, query_text, propositions_json, specification_text):
273
+ """
274
+ Returns: (cropped_video_path, prop_ranges_text, tl_text)
275
+ """
276
+
277
+ def _err(msg, width=320, height=240): # keep outputs shape consistent
278
+ tmp_out = os.path.join("/tmp", f"empty_{uuid.uuid4().hex}.mp4")
279
+ _make_empty_video(tmp_out, width=width, height=height, fps=1.0)
280
+ return (
281
+ tmp_out,
282
+ "No propositions detected.",
283
+ f"Error: {msg}"
284
+ )
285
+
286
+ # Resolve video path
287
+ if isinstance(input_video, dict) and "name" in input_video:
288
+ video_path = input_video["name"]
289
+ elif isinstance(input_video, str):
290
+ video_path = input_video
291
+ else:
292
+ return _err("Please provide a video.")
293
+
294
+ # Build entry
295
+ if mode == "Natural language query":
296
+ if not query_text or not query_text.strip():
297
+ return _err("Please enter a query.")
298
+ entry = _load_entry_from_reader(video_path, query_text)
299
+ else:
300
+ if not (propositions_json and propositions_json.strip()) or not (specification_text and specification_text.strip()):
301
+ return _err("Please provide both Propositions (array) and Specification.")
302
+ entry = _load_entry_from_reader(video_path, "dummy-query")
303
+ try:
304
+ props = json.loads(propositions_json)
305
+ if not isinstance(props, list):
306
+ return _err("Propositions must be a JSON array.")
307
+ except Exception as e:
308
+ return _err(f"Failed to parse propositions JSON: {e}")
309
+ entry["tl"] = {
310
+ "propositions": props,
311
+ "specification": specification_text
312
+ }
313
+
314
+ # Compute FOI
315
+ try:
316
+ foi, prop_matrix = process_entry(entry) # list of frame indices & {prop: [frames]}
317
+ print(foi)
318
+ print(prop_matrix)
319
+ except Exception as e:
320
+ return _err(f"Processing error: {e}")
321
+
322
+ # Write cropped video
323
+ try:
324
+ out_path = os.path.join("/tmp", f"cropped_{uuid.uuid4().hex}.mp4")
325
+ _crop_video(video_path, out_path, foi, prop_matrix)
326
+ print(f"Wrote cropped video to: {out_path}")
327
+ except Exception as e:
328
+ return _err(f"Failed to write cropped video: {e}")
329
+
330
+ # Build right-side text sections
331
+ prop_ranges_text = _format_prop_ranges(prop_matrix)
332
+ tl_text = (
333
+ f"Propositions: {json.dumps(entry['tl']['propositions'], ensure_ascii=False)}\n"
334
+ f"Specification: {entry['tl']['specification']}"
335
+ )
336
+ return out_path, prop_ranges_text, tl_text
337
+
338
+
339
+ # -----------------------------
340
+ # UI
341
+ # -----------------------------
342
+ with gr.Blocks(css="""
343
+ #io-col {display: flex; gap: 1rem;}
344
+ #left {flex: 1;}
345
+ #right {flex: 1;}
346
+ """, title="NSVS-TL") as demo:
347
+
348
+ gr.Markdown("# Neuro-Symbolic Visual Search with Temporal Logic")
349
+ gr.Markdown(
350
+ "Upload a video and either provide a natural-language **Query** *or* directly supply **Propositions** (array) + **Specification**. "
351
+ "On the right, you'll get a **cropped video** containing only the frames of interest, a **Propositions by Frames** summary, and the combined TL summary."
352
+ )
353
+
354
+ with gr.Row(elem_id="io-col"):
355
+ with gr.Column(elem_id="left"):
356
+ mode = gr.Radio(
357
+ choices=["Natural language query", "Props/Spec"],
358
+ value="Natural language query",
359
+ label="Input mode"
360
+ )
361
+ video = gr.Video(label="Upload Video")
362
+
363
+ query = gr.Textbox(
364
+ label="Query (natural language)",
365
+ placeholder="e.g., a man is jumping and panting until he falls down"
366
+ )
367
+
368
+ propositions = gr.Textbox(
369
+ label="Propositions (JSON array)",
370
+ placeholder='e.g., ["man_jumps", "man_pants", "man_falls_down"]',
371
+ lines=4,
372
+ visible=False
373
+ )
374
+ specification = gr.Textbox(
375
+ label="Specification",
376
+ placeholder='e.g., ("woman_jumps" & "woman_claps") U "candle_is_blown"',
377
+ visible=False
378
+ )
379
+
380
+ def _toggle_fields(m):
381
+ if m == "Natural language query":
382
+ return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
383
+ else:
384
+ return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
385
+
386
+ mode.change(_toggle_fields, inputs=[mode], outputs=[query, propositions, specification])
387
+
388
+ run_btn = gr.Button("Run", variant="primary")
389
+
390
+ gr.Examples(
391
+ label="Examples (dummy paths + queries)",
392
+ examples=[
393
+ ["demo_videos/dog_jump.mp4", "a dog jumps until a red tube is in view"],
394
+ ["demo_videos/blue_shirt.mp4", "a girl in a green shirt until a candle is blown"],
395
+ ["demo_videos/car.mp4", "red car until a truck"]
396
+ ],
397
+ inputs=[video, query],
398
+ cache_examples=False
399
+ )
400
+
401
+ with gr.Column(elem_id="right"):
402
+ cropped_video = gr.Video(label="Cropped Video (Frames of Interest Only)")
403
+
404
+ prop_ranges_out = gr.Textbox(
405
+ label="Propositions by Frames",
406
+ lines=6,
407
+ interactive=False
408
+ )
409
+
410
+ tl_out = gr.Textbox(
411
+ label="TL (Propositions & Specification)",
412
+ lines=8,
413
+ interactive=False
414
+ )
415
+
416
+ run_btn.click(
417
+ fn=run_pipeline,
418
+ inputs=[video, mode, query, propositions, specification],
419
+ outputs=[cropped_video, prop_ranges_out, tl_out]
420
+ )
421
+
422
+ if __name__ == "__main__":
423
+ demo.launch(server_name="0.0.0.0", server_port=7860)
424
+
execute_with_mp4.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import itertools
3
+ import operator
4
+ import json
5
+ import time
6
+ import os
7
+
8
+ from ns_vfs.nsvs import run_nsvs
9
+ from ns_vfs.video.read_mp4 import Mp4Reader
10
+
11
+
12
+ VIDEOS = [
13
+ {
14
+ "path": "demo_videos/blue_shirt.mp4",
15
+ "query": "a woman is jumping and clapping until a candle is blown"
16
+ }
17
+ ]
18
+ DEVICE = 7 # GPU device index
19
+ OPENAI_SAVE_PATH = ""
20
+ OUTPUT_DIR = "output"
21
+
22
+ def fill_in_frame_count(arr, entry):
23
+ scale = (entry["video_info"].fps) / (entry["metadata"]["sampling_rate_fps"])
24
+
25
+ runs = []
26
+ for _, grp in itertools.groupby(sorted(arr), key=lambda x, c=[0]: (x - (c.__setitem__(0, c[0]+1) or c[0]))):
27
+ g = list(grp)
28
+ runs.append((g[0], g[-1]))
29
+
30
+ real = []
31
+ for start_i, end_i in runs:
32
+ a = int(round(start_i * scale))
33
+ b = int(round(end_i * scale))
34
+ if real and a <= real[-1]:
35
+ a = real[-1] + 1
36
+ real.extend(range(a, b + 1))
37
+ return real
38
+
39
+ def process_entry(entry):
40
+ foi, object_frame_dict = run_nsvs(
41
+ frames=entry['images'],
42
+ proposition=entry['tl']['propositions'],
43
+ specification=entry['tl']['specification'],
44
+ model_name="InternVL2-8B",
45
+ device=DEVICE
46
+ )
47
+
48
+ foi = fill_in_frame_count([i for sub in foi for i in sub], entry)
49
+ object_frame_dict = {key: fill_in_frame_count(value, entry) for key, value in object_frame_dict.items()}
50
+ return foi, object_frame_dict
51
+
52
+ def main():
53
+ reader = Mp4Reader(VIDEOS, OPENAI_SAVE_PATH, sampling_rate_fps=1)
54
+ data = reader.read_video()
55
+ if not data:
56
+ return
57
+
58
+ with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
59
+ for i, entry in pbar:
60
+ start_time = time.time()
61
+ foi = process_entry(entry)
62
+ end_time = time.time()
63
+ processing_time = round(end_time - start_time, 3)
64
+
65
+ if foi:
66
+ output = {
67
+ "tl": entry["tl"],
68
+ "metadata": entry["metadata"],
69
+ "video_info": entry["video_info"].to_dict(),
70
+ "frames_of_interest": foi,
71
+ "processting_time_seconds": processing_time
72
+ }
73
+
74
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
75
+ with open(os.path.join(OUTPUT_DIR, f"output_{i}.json"), "w") as f:
76
+ json.dump(output, f, indent=4)
77
+
78
+ if __name__ == "__main__":
79
+ main()
execute_with_tlv.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import json
3
+ import time
4
+ import os
5
+
6
+ from ns_vfs.nsvs import run_nsvs
7
+ from ns_vfs.video.read_tlv import TLVReader
8
+
9
+
10
+ TLV_PATH = "/nas/dataset/tlv-dataset-v1"
11
+ DEVICE = 7 # GPU device index
12
+ OUTPUT_DIR = "output"
13
+
14
+ def process_entry(entry):
15
+ foi = run_nsvs(
16
+ frames=entry['images'],
17
+ proposition=entry['tl']['propositions'],
18
+ specification=entry['tl']['specification'],
19
+ model_name="InternVL2-8B",
20
+ device=DEVICE
21
+ )
22
+ return foi
23
+
24
+ def main():
25
+ reader = TLVReader(TLV_PATH)
26
+ data = reader.read_video()
27
+ if not data:
28
+ return
29
+
30
+ with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
31
+ for i, entry in pbar:
32
+ start_time = time.time()
33
+ foi = process_entry(entry)
34
+ end_time = time.time()
35
+ processing_time = round(end_time - start_time, 3)
36
+
37
+ if foi:
38
+ output = {
39
+ "tl": entry["tl"],
40
+ "metadata": entry["metadata"],
41
+ "video_info": entry["video_info"].to_dict(),
42
+ "frames_of_interest": foi,
43
+ "processting_time_seconds": processing_time
44
+ }
45
+
46
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
47
+ with open(os.path.join(OUTPUT_DIR, f"output_{i}.json"), "w") as f:
48
+ json.dump(output, f, indent=4)
49
+
50
+ if __name__ == "__main__":
51
+ main()
launch_space.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ apt update
4
+ apt install -y ffmpeg
5
+
6
+ # Start vLLM server in background
7
+ ./vllm_serve.sh &
8
+
9
+ # Wait briefly to ensure vLLM is up before Gradio tries to connect
10
+ sleep 60
11
+
12
+ # Display fancy startup message
13
+ echo "
14
+ ╔════════════════════════════════════════════════════════════════╗
15
+ ║ ║
16
+ ║ 🚀 Gradio Space Starting! 🚀 ║
17
+ ║ ║
18
+ ╚════════════════════════════════════════════════════════════════╝
19
+ "
20
+
21
+ # Start Gradio app
22
+ python3 execute_demo.py
ns_vfs/model_checker/__init__.py ADDED
File without changes
ns_vfs/model_checker/frame_validator.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import enum
3
+
4
+ from ns_vfs.video.frame import VideoFrame
5
+
6
+ class SymbolicFilterRule(enum.Enum):
7
+ AVOID_PROPS = "avoid"
8
+ ASSOCIATED_PROPS = "associated"
9
+
10
+ class FrameValidator:
11
+ def __init__(
12
+ self,
13
+ ltl_formula: str,
14
+ threshold_of_probability: float = 0.5,
15
+ ):
16
+ self.threshold_of_probability = threshold_of_probability
17
+
18
+ ltl_formula = ltl_formula[ltl_formula.find('[') + 1:ltl_formula.rfind(']')]
19
+ if " U " in ltl_formula:
20
+ rule_1 = self.get_symbolic_rule_from_ltl_formula(ltl_formula.split(" U ")[0])
21
+ rule_2 = self.get_symbolic_rule_from_ltl_formula(ltl_formula.split(" U ")[1])
22
+ self.symbolic_verification_rule = {
23
+ SymbolicFilterRule.ASSOCIATED_PROPS: rule_1[SymbolicFilterRule.ASSOCIATED_PROPS] + rule_2[SymbolicFilterRule.ASSOCIATED_PROPS],
24
+ SymbolicFilterRule.AVOID_PROPS: rule_1[SymbolicFilterRule.AVOID_PROPS] or rule_2[SymbolicFilterRule.AVOID_PROPS],
25
+ }
26
+ else:
27
+ self.symbolic_verification_rule = self.get_symbolic_rule_from_ltl_formula(ltl_formula)
28
+
29
+ def validate_frame(
30
+ self,
31
+ frame: VideoFrame,
32
+ ):
33
+ """Validate frame."""
34
+ thresholded_objects = frame.thresholded_detected_objects(self.threshold_of_probability)
35
+ if len(thresholded_objects) > 0:
36
+ return self.symbolic_verification(frame)
37
+ else:
38
+ return False
39
+
40
+ def symbolic_verification(self, frame: VideoFrame):
41
+ """Symbolic verification."""
42
+ avoid_props = self.symbolic_verification_rule.get(SymbolicFilterRule.AVOID_PROPS)
43
+ if avoid_props:
44
+ for prop in frame.object_of_interest.keys():
45
+ if frame.object_of_interest[prop].get_detected_probability() >= self.threshold_of_probability and prop in avoid_props: # detected but also in avoid_props
46
+ return False
47
+
48
+ associated_props = self.symbolic_verification_rule.get(SymbolicFilterRule.ASSOCIATED_PROPS)
49
+ for group in associated_props:
50
+ bad = 0
51
+ total = 0
52
+ for prop in group:
53
+ total += 1
54
+ if frame.object_of_interest[prop].get_detected_probability() < self.threshold_of_probability:
55
+ bad += 1
56
+ if total > 2 * bad:
57
+ return True
58
+ return False
59
+
60
+ def get_symbolic_rule_from_ltl_formula(self, ltl_formula: str) -> dict:
61
+ symbolic_verification_rule = {}
62
+
63
+ if "!" in ltl_formula:
64
+ match = re.search(r'(?<!\w)!\s*(?:\((.*?)\)|([^\s\)]+))', ltl_formula)
65
+ avoid_tl = (match.group(1) or match.group(2)).strip()
66
+ symbolic_verification_rule[SymbolicFilterRule.AVOID_PROPS] = avoid_tl
67
+ else:
68
+ symbolic_verification_rule[SymbolicFilterRule.AVOID_PROPS] = None
69
+
70
+ ltl_formula = re.sub(r"[!GF]", "", ltl_formula.strip())
71
+ while ltl_formula.startswith("(") and ltl_formula.endswith(")") and ltl_formula.count("(") == ltl_formula.count(")"):
72
+ ltl_formula = ltl_formula[1:-1].strip()
73
+
74
+ split_and_clean = lambda expr: [re.sub(r"[()]", "", p).strip() for p in re.split(r"\s*&\s*", expr) if p.strip()]
75
+
76
+ match = re.search(r'\b( U |F)\b', ltl_formula)
77
+ if match:
78
+ idx = match.start()
79
+ associated = [split_and_clean(ltl_formula[:idx]), split_and_clean(ltl_formula[idx + len(match.group(1)):])]
80
+ else:
81
+ associated = [split_and_clean(ltl_formula)]
82
+ associated = [[s.strip('"') for s in sublist] for sublist in associated]
83
+ symbolic_verification_rule[SymbolicFilterRule.ASSOCIATED_PROPS] = associated
84
+
85
+ return symbolic_verification_rule
86
+
ns_vfs/model_checker/property_checker.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ns_vfs.model_checker.stormpy import StormModelChecker
2
+ from ns_vfs.model_checker.frame_validator import FrameValidator
3
+
4
+ class PropertyChecker:
5
+ def __init__(self, proposition, specification, model_type, tl_satisfaction_threshold, detection_threshold):
6
+ self.proposition = proposition
7
+ self.tl_satisfaction_threshold = tl_satisfaction_threshold
8
+ self.specification = self.generate_specification(specification)
9
+ self.model_type = model_type
10
+ self.detection_threshold = detection_threshold
11
+
12
+ self.model_checker = StormModelChecker(
13
+ proposition_set=self.proposition,
14
+ ltl_formula=self.specification
15
+ )
16
+ self.frame_validator = FrameValidator(
17
+ ltl_formula=self.specification,
18
+ threshold_of_probability=self.detection_threshold
19
+ )
20
+
21
+ def generate_specification(self, specification_raw):
22
+ return f'P>={self.tl_satisfaction_threshold:.2f} [ {specification_raw} ]'
23
+
24
+ def validate_frame(self, frame_of_interest):
25
+ return self.frame_validator.validate_frame(frame_of_interest)
26
+
27
+ def check_automaton(self, automaton):
28
+ return self.model_checker.check_automaton(
29
+ transitions=automaton.transitions,
30
+ states=automaton.states,
31
+ model_type=self.model_type
32
+ )
33
+
34
+ def validate_tl_specification(self, specification):
35
+ return self.model_checker.validate_tl_specification(specification)
36
+
37
+
38
+
ns_vfs/model_checker/stormpy.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+
4
+ import numpy as np
5
+ import stormpy
6
+ import stormpy.examples.files
7
+ from stormpy import ExplicitQualitativeCheckResult
8
+
9
+ from ns_vfs.model_checker.video_state import VideoState
10
+
11
+
12
+ class StormModelChecker:
13
+ """Model Checker using Stormpy for verifying properties."""
14
+
15
+ def __init__(
16
+ self,
17
+ proposition_set: list[str],
18
+ ltl_formula: str,
19
+ ) -> None:
20
+ """Initialize the StormModelChecker.
21
+
22
+ Args:
23
+ proposition_set: List of propositions.
24
+ ltl_formula: LTL formula to check.
25
+ verbose: Enable verbose output.
26
+ is_filter: Apply filtering to results.
27
+ """
28
+ self.proposition_set = proposition_set
29
+ self.ltl_formula = ltl_formula
30
+
31
+ def create_model(
32
+ self,
33
+ transitions: list[tuple[int, int, float]],
34
+ states: list[VideoState],
35
+ model_type: str = "sparse_ma",
36
+ ) -> any:
37
+ """Create model.
38
+
39
+ Args:
40
+ transitions (list[tuple[int, int, float]]): List of transitions.
41
+ states (list[VideoState]): List of states.
42
+ model_type (str): Type of model to create ("sparse_ma" or "dtmc").
43
+ verbose (bool): Whether to print verbose output.
44
+ """
45
+ state_labeling = self._build_label_func(states, self.proposition_set)
46
+ if model_type in ["sparse_ma", "mdp"]:
47
+ transition_matrix = self._build_trans_matrix(
48
+ transitions=transitions,
49
+ states=states,
50
+ model_type="nondeterministic",
51
+ )
52
+ else:
53
+ transition_matrix = self._build_trans_matrix(
54
+ transitions=transitions,
55
+ states=states,
56
+ model_type="deterministic",
57
+ )
58
+ components = stormpy.SparseModelComponents(
59
+ transition_matrix=transition_matrix,
60
+ state_labeling=state_labeling,
61
+ )
62
+ if model_type == "sparse_ma":
63
+ markovian_states = stormpy.BitVector(len(states), list(range(len(states))))
64
+ components.markovian_states = markovian_states
65
+ components.exit_rates = [1.0 for _ in range(len(states))]
66
+ model = stormpy.SparseMA(components)
67
+ elif model_type == "dtmc":
68
+ model = stormpy.storage.SparseDtmc(components)
69
+ elif model_type == "mdp":
70
+ model = stormpy.storage.SparseMdp(components)
71
+ else:
72
+ msg = f"Unsupported model type: {model_type}"
73
+ raise ValueError(msg)
74
+ return model
75
+
76
+ def check_automaton(
77
+ self,
78
+ transitions: list[tuple[int, int, float]],
79
+ states: list[VideoState],
80
+ model_type: str = "sparse_ma"
81
+ ) -> any:
82
+ """Check automaton.
83
+
84
+ Args:
85
+ transitions: List of transitions.
86
+ states: List of states.
87
+ verbose: Enable verbose output.
88
+ use_filter: Apply filtering to results.
89
+ """
90
+ model = self.create_model(
91
+ transitions=transitions,
92
+ states=states,
93
+ model_type=model_type,
94
+ )
95
+
96
+ # Define Properties
97
+ properties = stormpy.parse_properties_without_context(self.ltl_formula,)
98
+
99
+ # Get Result and Filter it
100
+ result = stormpy.model_checking(model, properties[0])
101
+
102
+ return self.qualitative_result_eval(result)
103
+
104
+ def qualitative_result_eval(self, verification_result: ExplicitQualitativeCheckResult) -> bool:
105
+ if isinstance(verification_result, ExplicitQualitativeCheckResult):
106
+ # string result is "true" when is absolutely true
107
+ # but it returns "true, false" when we have some true and false
108
+ verification_result_str = str(verification_result)
109
+ string_result = verification_result_str.split("{")[-1].split("}")[0]
110
+ if len(string_result) == 4:
111
+ if string_result[0] == "t": # 0,6
112
+ result = True
113
+ elif len(string_result) > 5:
114
+ # "true, false" -> some true and some false
115
+ result = True
116
+ else:
117
+ result = False
118
+ return result
119
+ msg = "Model Checking is not qualitative"
120
+ raise ValueError(msg)
121
+
122
+ def _build_trans_matrix(
123
+ self,
124
+ transitions: list[tuple[int, int, float]],
125
+ states: list[VideoState],
126
+ model_type: str = "nondeterministic",
127
+ ) -> stormpy.storage.SparseMatrix:
128
+ """Build transition matrix.
129
+
130
+ Args:
131
+ transitions: List of transitions.
132
+ states: List of states.
133
+ model_type: Type of model ("nondeterministic" or "deterministic").
134
+ """
135
+ if model_type not in ["nondeterministic", "deterministic"]:
136
+ msg = "Invalid model_type. Must be 'nondeterministic' or 'deterministic'"
137
+ raise ValueError(msg)
138
+
139
+ if model_type == "nondeterministic":
140
+ matrix = np.zeros((len(states), len(states)))
141
+ for t in transitions:
142
+ matrix[int(t[0]), int(t[1])] = float(t[2])
143
+ trans_matrix = stormpy.build_sparse_matrix(matrix, list(range(len(states))))
144
+
145
+ elif model_type == "deterministic":
146
+ num_states = len(states)
147
+ builder = stormpy.SparseMatrixBuilder(
148
+ rows=num_states,
149
+ columns=num_states,
150
+ entries=len(transitions),
151
+ force_dimensions=False,
152
+ )
153
+ states_with_transitions = set(src for src, _, _ in transitions)
154
+ outgoing_probs = {i: 0.0 for i in range(num_states)}
155
+
156
+ for src, dest, prob in transitions:
157
+ builder.add_next_value(src, dest, prob)
158
+ outgoing_probs[src] += prob
159
+
160
+ for state in range(num_states):
161
+ if state not in states_with_transitions:
162
+ builder.add_next_value(state, state, 1.0)
163
+ outgoing_probs[state] = 1.0
164
+
165
+ # Check probabilities
166
+ for state, prob_sum in outgoing_probs.items():
167
+ # if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
168
+ if not math.isclose(prob_sum, 1.0, abs_tol=1e-2):
169
+ logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")
170
+
171
+ # ... (existing logging code) ...
172
+ trans_matrix = builder.build()
173
+ return trans_matrix
174
+
175
+ def _build_label_func(
176
+ self,
177
+ states: list[VideoState],
178
+ props: list[str],
179
+ model_type: str = "nondeterministic",
180
+ ) -> stormpy.storage.StateLabeling:
181
+ """Build label function.
182
+
183
+ Args:
184
+ states (list[State]): List of states.
185
+ props (list[str]): List of propositions.
186
+ model_type (str): Type of model
187
+ ("nondeterministic" or "deterministic").
188
+
189
+ Returns:
190
+ stormpy.storage.StateLabeling: State labeling.
191
+ """
192
+ state_labeling = stormpy.storage.StateLabeling(len(states))
193
+ state_labeling.add_label("init")
194
+ state_labeling.add_label("terminal")
195
+ for label in props:
196
+ state_labeling.add_label(label)
197
+
198
+ if model_type == "nondeterministic":
199
+ for state in states:
200
+ for label in state.descriptive_label:
201
+ state_labeling.add_label_to_state(label, state.state_index)
202
+ else:
203
+ for i, state in enumerate(states):
204
+ for prop in state.props:
205
+ if prop in props:
206
+ state_labeling.add_label_to_state(prop, i)
207
+ return state_labeling
208
+
209
+ def validate_tl_specification(self, ltl_formula: str) -> bool:
210
+ """Validate LTL specification.
211
+
212
+ Args:
213
+ ltl_formula: LTL formula to validate.
214
+ """
215
+ path = stormpy.examples.files.prism_dtmc_die # prism_mdp_maze
216
+ prism_program = stormpy.parse_prism_program(path)
217
+ # Define Properties
218
+ try:
219
+ stormpy.parse_properties(ltl_formula, prism_program)
220
+ except Exception as e:
221
+ msg = f"Error validating LTL specification: {e}"
222
+ logging.exception(msg)
223
+ return False
224
+ else:
225
+ return True
ns_vfs/model_checker/video_automaton.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ns_vfs.model_checker.video_state import VideoState
2
+ from ns_vfs.video.frame import VideoFrame
3
+
4
+
5
+ class VideoAutomaton:
6
+ """Represents a Markov Automaton for video state modeling."""
7
+
8
+ def __init__(self, include_initial_state: bool = False) -> None:
9
+ """Initialize the MarkovAutomaton.
10
+
11
+ Args:
12
+ include_initial_state (bool, optional): Whether to include
13
+ the initial state. Defaults to False.
14
+ proposition_set (list[str] | None, optional): List of propositions.
15
+ Defaults to None.
16
+ """
17
+ self.previous_states: list[VideoState] = []
18
+ self.states: list[VideoState] = []
19
+ self.transitions = []
20
+ self.include_initial_state = include_initial_state
21
+
22
+ def set_up(self, proposition_set: list[str]) -> None:
23
+ """Set up the MarkovAutomaton."""
24
+ self.proposition_set = proposition_set
25
+ self.label_combinations = self._create_label_combinations(len(proposition_set))
26
+ self.probability_of_propositions = [[] for _ in range(len(proposition_set))]
27
+ self.frame_index_in_automaton = 0
28
+
29
+ if self.include_initial_state:
30
+ initial_state = VideoState(
31
+ state_index=0,
32
+ frame_index=-1,
33
+ label="init",
34
+ proposition_set=proposition_set,
35
+ )
36
+ self.previous_states = [initial_state]
37
+ self.states = [initial_state]
38
+ self._current_state = initial_state
39
+
40
+ def reset(self) -> None:
41
+ """Reset automaton."""
42
+ self.__init__(self.include_initial_state)
43
+ self.set_up(self.proposition_set)
44
+
45
+ def add_frame(self, frame: VideoFrame) -> None:
46
+ """Add frame to automaton."""
47
+ self._get_probability_of_propositions(frame)
48
+ current_states = []
49
+ for prop_comb in self.label_combinations:
50
+ # iterate through all possible combinations of T and F
51
+ self._current_state = VideoState(
52
+ state_index=len(self.states),
53
+ frame_index=self.frame_index_in_automaton,
54
+ label=prop_comb,
55
+ proposition_set=self.proposition_set,
56
+ )
57
+ # TODO: Make a method for update and compute probability
58
+ self._current_state.update(
59
+ frame_index=self.frame_index_in_automaton,
60
+ target_label=prop_comb,
61
+ )
62
+ self._current_state.compute_probability(probabilities=self.probability_of_propositions)
63
+ if self._current_state.probability > 0:
64
+ self.states.append(self._current_state)
65
+ current_states.append(self._current_state)
66
+
67
+ # Build transitions from previous states to current states
68
+ if self.previous_states:
69
+ for prev_state in self.previous_states:
70
+ for cur_state in current_states:
71
+ transition = (
72
+ prev_state.state_index,
73
+ cur_state.state_index,
74
+ cur_state.probability,
75
+ )
76
+ self.transitions.append(transition)
77
+
78
+ self.previous_states = current_states if current_states else self.previous_states
79
+ self.frame_index_in_automaton += 1
80
+
81
+ def add_terminal_state(self, add_with_terminal_label: bool = False) -> None:
82
+ """Add terminal state to the automaton."""
83
+ if add_with_terminal_label:
84
+ terminal_state_index = len(self.states)
85
+ terminal_state = VideoState(
86
+ state_index=terminal_state_index,
87
+ frame_index=self.frame_index_in_automaton,
88
+ label="terminal",
89
+ proposition_set=self.proposition_set,
90
+ )
91
+ self.states.append(terminal_state)
92
+ self._current_state = terminal_state
93
+
94
+ self.transitions.extend(
95
+ (prev_state.state_index, terminal_state_index, 1.0) for prev_state in self.previous_states
96
+ )
97
+ self.transitions.append((terminal_state_index, terminal_state_index, 1.0))
98
+ else:
99
+ self.transitions.extend(
100
+ (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
101
+ )
102
+
103
+ def get_frame_to_state_index(self) -> dict[int, list[int]]:
104
+ """Get frame to state index mapping."""
105
+ data = {}
106
+ for state in self.states:
107
+ if state.frame_index not in data:
108
+ data[state.frame_index] = []
109
+ data[state.frame_index].append(state.state_index)
110
+ return data
111
+
112
+ def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
113
+ """Update the probability of propositions."""
114
+ for i, prop in enumerate(self.proposition_set):
115
+ if frame.object_of_interest.get(prop):
116
+ probability = frame.object_of_interest[prop].get_detected_probability()
117
+ else:
118
+ prop = prop.replace("_", " ")
119
+ if frame.object_of_interest.get(prop):
120
+ probability = frame.object_of_interest[prop].get_detected_probability()
121
+ else:
122
+ probability = 0
123
+ self.probability_of_propositions[i].append(round(float(probability), 2))
124
+
125
+ def _create_label_combinations(self, num_props: int) -> list[str]:
126
+ """Create all possible combinations of T and F for the number of propositions.
127
+
128
+ Args:
129
+ num_props (int): Number of propositions.
130
+
131
+ Returns:
132
+ list[str]: List of all possible combinations of T and F.
133
+ """
134
+ label_list = []
135
+
136
+ def add_labels(num_props: int, label: str, label_list: list[str]) -> None:
137
+ if len(label) == num_props:
138
+ label_list.append(label)
139
+ return
140
+ add_labels(num_props, label + "T", label_list)
141
+ add_labels(num_props, label + "F", label_list)
142
+
143
+ add_labels(num_props, "", label_list)
144
+ return label_list
ns_vfs/model_checker/video_state.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class VideoState:
2
+ """Video state class."""
3
+
4
+ def __init__(
5
+ self,
6
+ state_index: int,
7
+ frame_index: int,
8
+ label: str,
9
+ proposition_set: list[str],
10
+ probability: float = 1.0,
11
+ ) -> None:
12
+ """State class.
13
+
14
+ Args:
15
+ state_index (int): state_index.
16
+ frame_index (int): Frame index.
17
+ label (str): Label set. :abel is a string with characters T or F
18
+ indicating True or False
19
+ proposition_set (list[str]): Proposition set.
20
+ probability (float): Probability of the state.
21
+ """
22
+ self.state_index = state_index
23
+ self.frame_index = frame_index
24
+ self.proposition_set = proposition_set
25
+ self.label = label # "init", "terminal", TTT, TFT, FTT, etc.
26
+ self.descriptive_label = self._get_descriptive_label(label=label)
27
+ self.probability = probability
28
+
29
+ def __repr__(self) -> str:
30
+ """Representation of state."""
31
+ return f"{self.frame_index}|{self.state_index} ({self.probability}): {self.label}"
32
+
33
+ def __str__(self) -> str:
34
+ """String of state."""
35
+ return f"{self.__repr__()}"
36
+
37
+ def _get_descriptive_label(self, label: str) -> list:
38
+ """Get descriptive label.
39
+
40
+ Args:
41
+ label (str): Label.
42
+ """
43
+ labels = []
44
+ if label == "init":
45
+ labels.append("init")
46
+ elif label == "terminal":
47
+ labels.append("terminal")
48
+ else:
49
+ for i in range(len(self.proposition_set)):
50
+ if label[i] == "T":
51
+ labels.append(self.proposition_set[i])
52
+ return labels
53
+
54
+ def update(self, frame_index: int, target_label: str) -> None:
55
+ """Update state to the new state..
56
+
57
+ Args:
58
+ frame_index (int): Frame index.
59
+ target_label (str): Target label for the new state.
60
+ """
61
+ self.frame_index = frame_index
62
+ self.label = target_label # TTT, TFT, FTT, etc.
63
+ self.descriptive_label = self._get_descriptive_label(label=target_label)
64
+ self.probability = 1.0
65
+
66
+ def compute_probability(self, probabilities: list[list[float]]) -> None:
67
+ """Compute probability of the state given the probabilities of the propositions.
68
+
69
+ Args:
70
+ probabilities (list): list of probabilities of the propositions
71
+ e.g. two propositions with three frames
72
+ -> [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]].
73
+ """
74
+ probability = 1.0
75
+ for i in range(len(self.label)):
76
+ if self.label[i] == "T":
77
+ probability *= probabilities[i][self.frame_index]
78
+ else:
79
+ probability *= 1 - probabilities[i][self.frame_index]
80
+ self.probability = round(probability, 3)
ns_vfs/nsvs.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import warnings
3
+ import tqdm
4
+ import os
5
+
6
+ from ns_vfs.model_checker.property_checker import PropertyChecker
7
+ from ns_vfs.model_checker.video_automaton import VideoAutomaton
8
+ from ns_vfs.video.frame import FramesofInterest
9
+ from ns_vfs.vlm.vllm_client import VLLMClient
10
+ from ns_vfs.video.frame import VideoFrame
11
+ from ns_vfs.vlm.internvl import InternVL
12
+
13
+ PRINT_ALL = True
14
+ warnings.filterwarnings("ignore")
15
+
16
+ def run_nsvs(
17
+ frames: list[np.ndarray],
18
+ proposition: list,
19
+ specification: str,
20
+ model_name: str = "InternVL2-8B",
21
+ device: int = 0,
22
+ model_type: str = "dtmc",
23
+ num_of_frame_in_sequence: int = 3,
24
+ tl_satisfaction_threshold: float = 0.6,
25
+ detection_threshold: float = 0.5,
26
+ vlm_detection_threshold: float = 0.35,
27
+ image_output_dir: str = "output"
28
+ ):
29
+ """Find relevant frames from a video that satisfy a specification."""
30
+
31
+ object_frame_dict = {}
32
+ vlm = VLLMClient()
33
+ # vlm = InternVL(model_name=model_name, device=device)
34
+
35
+ automaton = VideoAutomaton(include_initial_state=True)
36
+ automaton.set_up(proposition_set=proposition)
37
+
38
+ checker = PropertyChecker(
39
+ proposition=proposition,
40
+ specification=specification,
41
+ model_type=model_type,
42
+ tl_satisfaction_threshold=tl_satisfaction_threshold,
43
+ detection_threshold=detection_threshold
44
+ )
45
+
46
+ frame_of_interest = FramesofInterest(num_of_frame_in_sequence)
47
+
48
+ frame_windows = []
49
+ for i in range(0, len(frames), num_of_frame_in_sequence):
50
+ frame_windows.append(frames[i : i + num_of_frame_in_sequence])
51
+
52
+ def process_frame(sequence_of_frames: list[np.ndarray], frame_count: int):
53
+ object_of_interest = {}
54
+
55
+ for prop in proposition:
56
+ detected_object = vlm.detect(
57
+ seq_of_frames=sequence_of_frames,
58
+ scene_description=prop,
59
+ threshold=vlm_detection_threshold
60
+ )
61
+ object_of_interest[prop] = detected_object
62
+ if detected_object.is_detected:
63
+ multi_frame_arr = [frame_count * num_of_frame_in_sequence + j for j in range(num_of_frame_in_sequence)]
64
+ if prop in object_frame_dict:
65
+ object_frame_dict[prop].extend(multi_frame_arr)
66
+ else:
67
+ object_frame_dict[prop] = multi_frame_arr
68
+ if PRINT_ALL:
69
+ print(f"\t{prop}: {detected_object.confidence}->{detected_object.probability}")
70
+
71
+ frame = VideoFrame(
72
+ frame_idx=frame_count,
73
+ frame_images=sequence_of_frames,
74
+ object_of_interest=object_of_interest,
75
+ )
76
+ return frame
77
+
78
+ if PRINT_ALL:
79
+ looper = enumerate(frame_windows)
80
+ else:
81
+ looper = tqdm.tqdm(enumerate(frame_windows), total=len(frame_windows))
82
+
83
+ for i, sequence_of_frames in looper:
84
+ if PRINT_ALL:
85
+ print("\n" + "*"*50 + f" {i}/{len(frame_windows)-1} " + "*"*50)
86
+ print("Detections:")
87
+ frame = process_frame(sequence_of_frames, i)
88
+ if PRINT_ALL:
89
+ os.makedirs(image_output_dir, exist_ok=True)
90
+ frame.save_frame_img(save_path=os.path.join(image_output_dir, f"{i}"))
91
+
92
+ if checker.validate_frame(frame_of_interest=frame):
93
+ automaton.add_frame(frame=frame)
94
+ frame_of_interest.frame_buffer.append(frame)
95
+ model_check = checker.check_automaton(automaton=automaton)
96
+ if model_check:
97
+ automaton.reset()
98
+ frame_of_interest.flush_frame_buffer()
99
+
100
+ foi = frame_of_interest.foi_list
101
+
102
+ if PRINT_ALL:
103
+ print("\n" + "-"*107)
104
+ print("Detected frames of interest:")
105
+ print(foi)
106
+
107
+ return foi, object_frame_dict
108
+
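
A minimal usage sketch of run_nsvs under stated assumptions: the ns_vfs package is importable, the vLLM server from vllm_serve.sh is reachable at its default address (run_nsvs instantiates VLLMClient()), and the frames, propositions, and specification below are placeholders in the form PULS produces:

    import numpy as np
    from ns_vfs.nsvs import run_nsvs

    frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(30)]  # dummy RGB frames
    foi, object_frame_dict = run_nsvs(
        frames=frames,
        proposition=["man_eats", "man_gets_up"],
        specification='"man_eats" U "man_gets_up"',
        num_of_frame_in_sequence=3,
    )
    print(foi)                # groups of frame indices that satisfy the specification
    print(object_frame_dict)  # proposition -> frame indices where it was detected
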
ns_vfs/puls/__init__.py ADDED
File without changes
ns_vfs/puls/llm.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import json
3
+ import os
4
+
5
+ class LLM:
6
+ def __init__(self, client, save_dir=""): # pass in save_dir to start saving
7
+ self.client = client
8
+ self.history = []
9
+ self.save_dir = save_dir
10
+ if save_dir != "":
11
+ os.makedirs(save_dir, exist_ok=True)
12
+
13
+ def prompt(self, p, openai_model):
14
+ user_message = {"role": "user", "content": [{"type": "text", "text": p}]}
15
+ self.history.append(user_message)
16
+
17
+ response = self.client.chat.completions.create(
18
+ model=openai_model,
19
+ messages=self.history,
20
+ store=False,
21
+ )
22
+ assistant_response = response.choices[0].message.content
23
+ assistant_message = {"role": "assistant", "content": [{"type": "text", "text": assistant_response}]}
24
+ self.history.append(assistant_message)
25
+
26
+ return assistant_response
27
+
28
+ def save_history(self, filename="conversation_history.json"):
29
+ if self.save_dir == "":
30
+ return None
31
+
32
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
33
+ base_name, extension = os.path.splitext(filename)
34
+ timestamped_filename = f"{base_name}_{timestamp}{extension}"
35
+
36
+ save_path = os.path.join(self.save_dir, timestamped_filename)
37
+ try:
38
+ with open(save_path, "w", encoding="utf-8") as f:
39
+ json.dump(self.history, f, indent=4, ensure_ascii=False)
40
+ return save_path
41
+ except Exception as e:
42
+ print(f"Failed to save conversation history: {e}")
43
+ return None
44
+
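
A small usage sketch of the LLM wrapper, assuming OPENAI_API_KEY is set in the environment; the model name and save directory are illustrative only:

    from openai import OpenAI

    llm = LLM(OpenAI(), save_dir="output/llm_logs")  # non-empty save_dir enables history saving
    answer = llm.prompt("Reply with a single word: OK", openai_model="gpt-4o-mini")
    print(answer)
    print(llm.save_history())  # e.g. output/llm_logs/conversation_history_<timestamp>.json
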
ns_vfs/puls/prompts.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def find_prompt(prompt):
2
+ full_prompt = f"""
3
+ You are an intelligent agent designed to extract structured representations from video description prompts. You will operate in two stages: (1) proposition extraction and (2) TL specification generation.
4
+
5
+ Stage 1: Proposition Extraction
6
+
7
+ Given an input prompt describing a sequence in a video, extract the atomic propositions that describe the underlying events or facts explicitly referenced. These propositions should describe the combined semantics of object-action or object-object relationships stated in the prompt — avoid making assumptions or inferring any additional events. Avoid TL keywords such as 'and', 'or', 'not', 'until'.
8
+ For example, given the prompt "A man is eating until he gets up", the correct propositions are ["man eats", "man gets up"].
9
+
10
+ Stage 2: TL Specification Generation
11
+
12
+ Using only the list of propositions extracted in Stage 1, generate a single Temporal Logic (TL) specification that captures the temporal and logical structure implied by the initial prompt.
13
+
14
+ Rules:
15
+ - The formula must use each proposition **exactly once**
16
+ - Use only the TL operators: `AND`, `OR`, `NOT`, `UNTIL`
17
+ - Do **not** infer new events or rephrase propositions.
18
+ - The formula should reflect the temporal or logical relationships between the propositions in a way that makes semantic sense.
19
+
20
+ **Examples**
21
+
22
+ Example 1: "A child is playing with his kite and running around before he unfortunately falls down"
23
+ Output:
24
+ {{
25
+ "proposition": ["child plays with kite", "child runs around", "child falls"],
26
+ "specification": "(child plays with kite AND child runs around) UNTIL child falls"
27
+ }}
28
+
29
+ Example 2: "In a dimly lit room, two robots stand silently. Suddenly, either the red robot starts blinking or the green robot does not turn off."
30
+ Output:
31
+ {{
32
+ "proposition": ["robots stand silently", "red robot starts blinking", "green robot turns off"],
33
+ "specification": "robots stand silently UNTIL (red robot starts blinking OR NOT green robot turns off)"
34
+ }}
35
+
36
+ Example 3: "Inside a cave, a man holds a lantern. A minute later, he suddenly sees a dragon."
37
+ Output:
38
+ {{
39
+ "proposition": ["man holds lantern", "man sees dragon"],
40
+ "specification": "man holds lantern UNTIL man sees dragon"
41
+ }}
42
+
43
+ Example 4: "The girl is turning on the computer."
44
+ Output:
45
+ {{
46
+ "proposition": ["girl turns on computer"],
47
+ "specification": "(girl turns on computer)"
48
+ }}
49
+
50
+ **Now process the following prompt:**
51
+ Input:
52
+ {{
53
+ "prompt": "{prompt}"
54
+ }}
55
+
56
+ Expected Output (only output the following JSON structure — nothing else):
57
+ {{
58
+ "proposition": [...],
59
+ "specification": "..."
60
+ }}
61
+ """
62
+ return full_prompt
ns_vfs/puls/puls.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ns_vfs.puls.llm import *
2
+ from ns_vfs.puls.prompts import *
3
+ from openai import OpenAI
4
+ import json
5
+ import os
6
+ import re
7
+
8
+ def clean_and_parse_json(raw_str):
9
+ start = raw_str.find('{')
10
+ end = raw_str.rfind('}') + 1
11
+ json_str = raw_str[start:end]
12
+ return json.loads(json_str)
13
+
14
+ def process_specification(specification, propositions):
15
+ new_propositions = []
16
+ for prop in propositions:
17
+ prop_cleaned = re.sub(r"^[^a-zA-Z]+|[^a-zA-Z]+$", "", prop)
18
+ prop_cleaned = re.sub(r"\s+", "_", prop_cleaned)
19
+ prop_cleaned = prop_cleaned.replace("'", "")
20
+ new_propositions.append(prop_cleaned)
21
+
22
+ for original, new in zip(propositions, new_propositions):
23
+ specification = specification.replace(original, f'"{new}"')
24
+
25
+ replacements = {
26
+ "AND": "&",
27
+ "OR": "|",
28
+ "UNTIL": "U",
29
+ "ALWAYS": "G",
30
+ "EVENTUALLY": "F",
31
+ "NOT": "!"
32
+ }
33
+ for word, symbol in replacements.items():
34
+ specification = specification.replace(word, symbol)
35
+
36
+ return new_propositions, specification
37
+
38
+ def PULS(query, openai_save_path, openai_model="o1-mini", openai_key=None):
39
+ if openai_key:
40
+ os.environ["OPENAI_API_KEY"] = openai_key
41
+
42
+ client = OpenAI()
43
+ llm = LLM(client, save_dir=openai_save_path)
44
+
45
+ full_prompt = find_prompt(query)
46
+ llm_output = llm.prompt(full_prompt, openai_model)
47
+ parsed = clean_and_parse_json(llm_output)
48
+
49
+ final_output = {}
50
+
51
+ cleaned_props, processed_spec = process_specification(parsed["specification"], parsed["proposition"])
52
+ final_output["proposition"] = cleaned_props
53
+ final_output["specification"] = processed_spec
54
+
55
+ saved_path = llm.save_history()
56
+ final_output["saved_path"] = saved_path
57
+
58
+ return final_output
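
A worked example of process_specification on output shaped like the prompt above: proposition names are trimmed and snake_cased, each occurrence in the specification is quoted, and the word operators become single-character symbols:

    props, spec = process_specification(
        specification="man eats UNTIL man gets up",
        propositions=["man eats", "man gets up"],
    )
    print(props)  # ['man_eats', 'man_gets_up']
    print(spec)   # '"man_eats" U "man_gets_up"'
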
ns_vfs/video/frame.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import numpy as np
3
+ import cv2
4
+
5
+
6
+ class VideoFrame:
7
+ """Frame class."""
8
+ def __init__(
9
+ self,
10
+ frame_idx: int,
11
+ frame_images: List[np.ndarray],
12
+ object_of_interest: dict
13
+ ):
14
+ self.frame_idx = frame_idx
15
+ self.frame_images = frame_images
16
+ self.object_of_interest = object_of_interest
17
+
18
+ def save_frame_img(self, save_path: str) -> None:
19
+ """Save frame image."""
20
+ if self.frame_images is not None:
21
+ for idx, img in enumerate(self.frame_images):
22
+ cv2.imwrite(f"{save_path}_{idx}.png", cv2.cvtColor(img, cv2.COLOR_RGB2BGR))  # frames are kept in RGB; convert to BGR for imwrite
23
+
24
+ def thresholded_detected_objects(self, threshold) -> dict:
25
+ """Get all detected object."""
26
+
27
+ detected_obj = {}
28
+ for prop in self.object_of_interest.keys():
29
+ probability = self.object_of_interest[prop].get_detected_probability()
30
+ if probability > threshold:
31
+ detected_obj[prop] = probability
32
+ return detected_obj
33
+
34
+
35
+ class FramesofInterest:
36
+ def __init__(self, num_of_frame_in_sequence):
37
+ self.num_of_frame_in_sequence = num_of_frame_in_sequence
38
+ self.foi_list = []
39
+ self.frame_buffer = []
40
+
41
+ def flush_frame_buffer(self):
42
+ """Flush frame buffer to frame of interest."""
43
+ if self.frame_buffer:
44
+ frame_interval = [frame.frame_idx for frame in self.frame_buffer]
45
+ self.foi_list.append([
46
+ i*self.num_of_frame_in_sequence + j
47
+ for i in frame_interval
48
+ for j in range(self.num_of_frame_in_sequence)
49
+ ])
50
+ self.frame_buffer = []
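
A small sketch of the index arithmetic in flush_frame_buffer: each buffered window index expands into the indices of its constituent frames. The frame indices are made up:

    foi = FramesofInterest(num_of_frame_in_sequence=3)
    foi.frame_buffer = [
        VideoFrame(frame_idx=2, frame_images=[], object_of_interest={}),
        VideoFrame(frame_idx=3, frame_images=[], object_of_interest={}),
    ]
    foi.flush_frame_buffer()
    print(foi.foi_list)  # [[6, 7, 8, 9, 10, 11]]
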
ns_vfs/video/read_mp4.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any
2
+ from tqdm import tqdm
3
+ import numpy as np
4
+ import cv2
5
+ import os
6
+
7
+ from ns_vfs.video.reader import VideoFormat, VideoInfo, Reader
8
+ from ns_vfs.puls.puls import PULS
9
+
10
+
11
+ class Mp4Reader(Reader):
12
+ def __init__(self, videos: List[Dict[str, str]], openai_save_path: str, sampling_rate_fps: float = 1.0):
13
+ self.videos = videos
14
+ if sampling_rate_fps is None or sampling_rate_fps <= 0:
15
+ raise ValueError("sampling_rate_fps must be > 0")
16
+ self.openai_save_path = openai_save_path
17
+ self.sampling_rate_fps = float(sampling_rate_fps)
18
+
19
+ def _sampled_frame_indices(self, fps: float, frame_count: int) -> List[int]:
20
+ if fps <= 0:
21
+ fps = 1.0
22
+
23
+ duration_sec = frame_count / fps if frame_count > 0 else 0.0
24
+ step_sec = 1.0 / self.sampling_rate_fps
25
+
26
+ times = [t for t in np.arange(0.0, duration_sec + 1e-9, step_sec)]
27
+ idxs = sorted(set(int(round(t * fps)) for t in times if t * fps < frame_count))
28
+ if not idxs and frame_count > 0:
29
+ idxs = [0]
30
+ return idxs
31
+
32
+ def _read_one(self, video_query: Dict[str, str]) -> Dict[str, Any] | None:
33
+ path = video_query["path"]
34
+ query = video_query["query"]
35
+
36
+ cap = cv2.VideoCapture(path)
37
+ if not cap.isOpened():
38
+ return None
39
+
40
+ fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
41
+ frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
42
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0)
43
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0)
44
+
45
+ frame_idxs = self._sampled_frame_indices(fps, frame_count)
46
+
47
+ images: List[np.ndarray] = []
48
+ for idx in frame_idxs:
49
+ cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
50
+ ok, frame_bgr = cap.read()
51
+ if not ok or frame_bgr is None:
52
+ continue
53
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
54
+ images.append(frame_rgb)
55
+
56
+ if (width == 0 or height == 0) and images:
57
+ height, width = images[0].shape[:2]
58
+
59
+ video_info = VideoInfo(
60
+ format=VideoFormat.MP4,
61
+ frame_width=width,
62
+ frame_height=height,
63
+ frame_count=frame_count,
64
+ fps=float(fps) if fps else None,
65
+ )
66
+
67
+ puls_output = PULS(query, self.openai_save_path, openai_key=os.environ.get("OPENAI_API_KEY"))
68
+
69
+ cap.release()
70
+ entry = {
71
+ "tl": {
72
+ "propositions": puls_output["proposition"],
73
+ "specification": puls_output["specification"],
74
+ "query": query,
75
+ },
76
+ "metadata": {
77
+ "video_path": path,
78
+ "sampling_rate_fps": self.sampling_rate_fps,
79
+ "puls_saved_path": puls_output["saved_path"],
80
+ },
81
+ "video_info": video_info,
82
+ "images": images,
83
+ }
84
+ return entry
85
+
86
+ def read_video(self) -> List[Dict[str, Any]]:
87
+ results: List[Dict[str, Any]] = []
88
+ with tqdm(total=len(self.videos), desc="Reading MP4s") as pbar:
89
+ for v in self.videos:
90
+ entry = self._read_one(v)
91
+ if entry is not None:
92
+ results.append(entry)
93
+ pbar.update(1)
94
+ return results
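
A usage sketch of Mp4Reader; the video path and query are hypothetical, and PULS needs a valid OpenAI API key available in the environment:

    reader = Mp4Reader(
        videos=[{"path": "videos/demo.mp4", "query": "A man is eating until he gets up"}],
        openai_save_path="output/puls_logs",
        sampling_rate_fps=1.0,
    )
    entries = reader.read_video()
    print(entries[0]["tl"]["propositions"])   # e.g. ['man_eats', 'man_gets_up']
    print(entries[0]["tl"]["specification"])  # e.g. '"man_eats" U "man_gets_up"'
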
ns_vfs/video/read_tlv.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Iterable
2
+ from tqdm import tqdm
3
+ import numpy as np
4
+ import pickle
5
+ import os
6
+
7
+ from ns_vfs.video.reader import VideoFormat, VideoInfo, Reader
8
+
9
+
10
+ class TLVReader(Reader):
11
+ def __init__(self, tlv_path: str):
12
+ self.tlv_path = tlv_path # /nas/dataset/tlv-dataset-v1
13
+
14
+ def _iter_tlv(self) -> Iterable[tuple[str, str, str]]:
15
+ for dataset_dir in os.listdir(self.tlv_path):
16
+ dataset_path = os.path.join(self.tlv_path, dataset_dir)
17
+ if not os.path.isdir(dataset_path):
18
+ continue
19
+ for format_dir in os.listdir(dataset_path):
20
+ format_path = os.path.join(dataset_path, format_dir)
21
+ if not os.path.isdir(format_path):
22
+ continue
23
+ for fname in os.listdir(format_path):
24
+ if fname.endswith(".pkl"):
25
+ yield dataset_dir, format_dir, os.path.join(format_path, fname)
26
+
27
+
28
+ def read_video(self) -> List[Dict[str, Any]]:
29
+ entries: List[Dict[str, Any]] = []
30
+
31
+ total = sum(1 for _ in self._iter_tlv())
32
+ with tqdm(total=total, desc="Loading TLV files") as pbar:
33
+ for dataset_dir, format_dir, file_path in self._iter_tlv():
34
+ with open(file_path, "rb") as f:
35
+ raw = pickle.load(f)
36
+
37
+ images: List[np.ndarray] = raw["images_of_frames"]
38
+ if len(images) == 0:
39
+ pbar.update(1)
40
+ continue
41
+
42
+ video_info = VideoInfo(
43
+ format=VideoFormat.LIST_OF_ARRAY,
44
+ frame_width=images[0].shape[1],
45
+ frame_height=images[0].shape[0],
46
+ frame_count=len(images),
47
+ fps=0.1, # 1 frame/10 sec
48
+ )
49
+
50
+ entry = {
51
+ "tl": {
52
+ "propositions": raw["proposition"],
53
+ "specification": raw["ltl_formula"],
54
+ "query": self.formatter(raw["ltl_formula"]),
55
+ },
56
+ "metadata": {
57
+ "type": {"dataset": dataset_dir, "format": format_dir},
58
+ "ground_truth": [i for sub in raw["frames_of_interest"] for i in sub],
59
+ },
60
+ "video_info": video_info,
61
+ "images": images,
62
+ }
63
+ entries.append(entry)
64
+ pbar.update(1)
65
+
66
+ return entries
67
+
ns_vfs/video/reader.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field, asdict
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Dict, Any
4
+ import enum
5
+ import uuid
6
+
7
+
8
+ class VideoFormat(enum.Enum):
9
+ MP4 = "mp4"
10
+ LIST_OF_ARRAY = "list_of_array"
11
+
12
+ @dataclass
13
+ class VideoInfo:
14
+ format: VideoFormat
15
+ frame_width: int
16
+ frame_height: int
17
+ frame_count: int
18
+ video_id: uuid.UUID = field(default_factory=uuid.uuid4)
19
+ fps: float | None = None
20
+
21
+ def to_dict(self):
22
+ d = asdict(self)
23
+ d["video_id"] = str(self.video_id)
24
+ d["format"] = self.format.value
25
+ return d
26
+
27
+ class Reader(ABC):
28
+ @abstractmethod
29
+ def read_video(self) -> List[Dict[str, Any]]:
30
+ pass
31
+
32
+ def formatter(self, spec: str) -> str:
33
+ spec = spec.replace("&", " and ")
34
+ spec = spec.replace("|", " or ")
35
+ spec = spec.replace("U", " until ")
36
+ spec = spec.replace("F", " eventually ")
37
+ spec = spec.replace("G", " always ")
38
+ spec = spec.replace("X", " next ")
39
+ spec = spec.replace('"', "")
40
+ spec = spec.replace("'", "")
41
+ spec = spec.replace("(", "")
42
+ spec = spec.replace(")", "")
43
+ while " " in spec:
44
+ spec = spec.replace(" ", " ")
45
+ spec = spec.strip()
46
+ return spec
47
+
48
+
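
A worked example of Reader.formatter, which turns a symbolic specification back into a plain-language query for the VLM. Because the operator replacements are plain single-character substitutions, proposition names are expected to be lowercase snake_case (as produced by process_specification); the tiny subclass below exists only to make the abstract class instantiable:

    class _DemoReader(Reader):
        def read_video(self):
            return []

    print(_DemoReader().formatter('"man_eats" U "man_gets_up"'))
    # man_eats until man_gets_up
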
ns_vfs/vlm/__init__.py ADDED
File without changes
ns_vfs/vlm/internvl.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import logging
3
+
4
+ import numpy as np
5
+ import copy
6
+ import torch
7
+ from torch.nn.functional import softmax
8
+ from transformers import AutoModel, AutoTokenizer
9
+
10
+ from ns_vfs.vlm.internvl_utils import (
11
+ assign_device_map,
12
+ load_video_from_seq_of_frames,
13
+ split_model,
14
+ )
15
+ from ns_vfs.vlm.obj import DetectedObject
16
+
17
+ class InternVL:
18
+ """InternVL's Vision Language Model."""
19
+
20
+ def __init__(
21
+ self,
22
+ model_name: str = "InternVL2-8B",
23
+ multi_gpus: bool = False,
24
+ device: int = 0,
25
+ ) -> None:
26
+ """Initialize the InternVL model."""
27
+ logging.info(
28
+ (
29
+ "You are using the model based on the HuggingFace API. "
30
+ "The model will be downloaded to the HuggingFace cache dir."
31
+ )
32
+ )
33
+ self.model_name = model_name
34
+ self._path = f"OpenGVLab/{model_name}"
35
+ self._num_gpus = torch.cuda.device_count()
36
+ self.device = device
37
+ if multi_gpus:
38
+ device_map = split_model(model_name)
39
+ else:
40
+ device_map = assign_device_map(model_name=model_name, manual_gpu_id=device)
41
+ self.model = AutoModel.from_pretrained(
42
+ self._path,
43
+ torch_dtype=torch.bfloat16,
44
+ low_cpu_mem_usage=True,
45
+ use_flash_attn=True,
46
+ trust_remote_code=True,
47
+ device_map=device_map,
48
+ ).eval()
49
+ self.model.apply(self.move_tensors_to_gpu)
50
+ self.tokenizer = AutoTokenizer.from_pretrained(self._path, trust_remote_code=True, use_fast=False)
51
+
52
+ def reset_model(self) -> None:
53
+ """Reset the model to its initial state using pretrained weights."""
54
+ self.model = AutoModel.from_pretrained(
55
+ self._path,
56
+ torch_dtype=torch.bfloat16,
57
+ low_cpu_mem_usage=True,
58
+ use_flash_attn=True,
59
+ trust_remote_code=True,
60
+ ).eval()
61
+ self.model.apply(self.move_tensors_to_gpu)
62
+
63
+ def clear_gpu_memory(self) -> None:
64
+ """Clear CUDA cache and run garbage collection to free GPU memory."""
65
+ torch.cuda.empty_cache()
66
+ if torch.cuda.is_available():
67
+ torch.cuda.ipc_collect()
68
+ gc.collect() # Run garbage collector
69
+
70
+ def move_tensors_to_gpu(
71
+ self,
72
+ module: torch.nn.Module,
73
+ ) -> None:
74
+ """Move all tensors in the module to GPU if they are on the CPU."""
75
+ for name, tensor in module.named_buffers():
76
+ if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
77
+ module.register_buffer(
78
+ name,
79
+ tensor.cuda(self.device),
80
+ persistent=False,
81
+ )
82
+ for _, param in module.named_parameters():
83
+ if param.device.type == "cpu":
84
+ param.data = param.data.cuda(self.device)
85
+
86
+ def detect(
87
+ self,
88
+ seq_of_frames: list[np.ndarray],
89
+ scene_description: str,
90
+ threshold: float
91
+ ) -> DetectedObject:
92
+ """Detect objects in the given frame image.
93
+
94
+ Args:
95
+ seq_of_frames (list[np.ndarray]): List of video frames to process.
96
+ scene_description (str): Description of the scene.
97
+ threshold (float): Detection threshold.
98
+
99
+ Returns:
100
+ DetectedObject: Detected objects with their details.
101
+ """
102
+ parsing_rule = "You must only return a Yes or No, and not both, to any question asked. You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times. For example, if the question is \"Is there a cat present in the sequence of images?\", the answer must only be 'Yes' or 'No'."
103
+ prompt = rf"Is there a {scene_description} present in the sequence of images? " f"\n[PARSING RULE]: {parsing_rule}"
104
+
105
+ response, confidence = self.infer_with_video_confidence(
106
+ language=prompt,
107
+ seq_of_frames=seq_of_frames,
108
+ )
109
+
110
+ detected = "yes" in response.lower()
111
+ probability = self.calibrate(confidence, false_threshold=threshold)
112
+
113
+ return DetectedObject(
114
+ name=scene_description,
115
+ is_detected=detected,
116
+ confidence=round(confidence, 3),
117
+ probability=round(probability, 3),
118
+ )
119
+
120
+ def infer_with_video_confidence(
121
+ self,
122
+ language: str,
123
+ seq_of_frames: list[np.ndarray],
124
+ max_new_tokens: int = 1024,
125
+ do_sample: bool = True,
126
+ ) -> tuple[str, float]:
127
+ """Perform video inference and return response with confidence score.
128
+
129
+ Args:
130
+ language (str): The input prompt or question.
131
+ seq_of_frames (list[np.ndarray] | None):
132
+ List of video frames as numpy arrays.
133
+ video_path (str | None): Path to the input video file.
134
+ max_new_tokens (int): Maximum number of new tokens to generate.
135
+ do_sample (bool): Whether to use sampling for generation.
136
+
137
+ Returns:
138
+ tuple[str, float]: Generated response and confidence score.
139
+ """
140
+
141
+ generation_config = {
142
+ "max_new_tokens": max_new_tokens,
143
+ "do_sample": do_sample,
144
+ }
145
+
146
+ pixel_values, num_patches_list = load_video_from_seq_of_frames(
147
+ seq_of_frames=seq_of_frames, device=self.device
148
+ )
149
+
150
+ video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
151
+ language = video_prefix + language
152
+
153
+ return self.chat_with_confidence(
154
+ self.tokenizer,
155
+ pixel_values,
156
+ language,
157
+ generation_config,
158
+ num_patches_list=num_patches_list,
159
+ )
160
+
161
+ def chat_with_confidence(
162
+ self,
163
+ tokenizer: AutoTokenizer,
164
+ pixel_values: torch.Tensor,
165
+ question: str,
166
+ generation_config: dict,
167
+ num_patches_list: list[int] | None = None,
168
+ IMG_START_TOKEN: str = "<img>",
169
+ IMG_END_TOKEN: str = "</img>",
170
+ IMG_CONTEXT_TOKEN: str = "<IMG_CONTEXT>",
171
+ verbose: bool = False,
172
+ ) -> tuple[str, float]:
173
+ """Generate a response with confidence score for the given input.
174
+
175
+ Args:
176
+ tokenizer: The tokenizer to use.
177
+ pixel_values: Image tensor input.
178
+ question: The input question or prompt.
179
+ generation_config: Configuration for text generation.
180
+ num_patches_list: List of number of patches for video frames.
181
+ IMG_START_TOKEN: Token to mark the start of an image.
182
+ IMG_END_TOKEN: Token to mark the end of an image.
183
+ IMG_CONTEXT_TOKEN: Token for image context.
184
+ verbose: Whether to print verbose output.
185
+
186
+ Returns:
187
+ A tuple containing the generated response and its confidence score.
188
+ """
189
+ if num_patches_list is None:
190
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
191
+
192
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
193
+
194
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
195
+ self.model.img_context_token_id = img_context_token_id
196
+
197
+ template = copy.deepcopy(self.model.conv_template)
198
+ template.system_message = self.model.system_message
199
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
200
+
201
+ template.append_message(template.roles[0], question)
202
+ template.append_message(template.roles[1], None)
203
+ query = template.get_prompt()
204
+
205
+ if verbose and pixel_values is not None:
206
+ image_bs = pixel_values.shape[0]
207
+ print(f"dynamic ViT batch size: {image_bs}")
208
+
209
+ for num_patches in num_patches_list:
210
+ context_tokens = IMG_CONTEXT_TOKEN * self.model.num_image_token * num_patches
211
+ image_tokens = IMG_START_TOKEN + context_tokens + IMG_END_TOKEN
212
+ query = query.replace("<image>", image_tokens, 1)
213
+
214
+ model_inputs = tokenizer(query, return_tensors="pt")
215
+ input_ids = model_inputs["input_ids"].cuda(self.device)
216
+ attention_mask = model_inputs["attention_mask"].cuda(self.device)
217
+ generation_config["eos_token_id"] = eos_token_id
218
+ generation_config["return_dict_in_generate"] = True
219
+ generation_config["output_scores"] = True
220
+ generation_config["output_logits"] = True
221
+ generation_output = self.model.generate(
222
+ pixel_values=pixel_values,
223
+ input_ids=input_ids,
224
+ attention_mask=attention_mask,
225
+ **generation_config,
226
+ )
227
+ response = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)[0]
228
+ response = response.split(template.sep)[0].strip()
229
+
230
+ logits_to_compute = np.where(generation_output.sequences[0].detach().cpu().numpy() != eos_token_id)[0]
231
+ confidence = 1.0
232
+ for logit in logits_to_compute:
233
+ token = generation_output.sequences[0, logit].item()
234
+ prob = softmax(generation_output.logits[logit], dim=-1)[0, token]  # softmax over the vocabulary dimension
235
+ confidence = prob.item() * confidence
236
+ self.clear_gpu_memory()
237
+
238
+ return response, confidence
239
+
240
+ def calibrate(
241
+ self,
242
+ confidence: float,
243
+ true_threshold=0.95,
244
+ false_threshold=0.40,
245
+ target_conf=0.60,
246
+ target_prob=0.78,
247
+ p_min=0.01,
248
+ p_max=0.99,
249
+ steepness_factor=0.7,
250
+ ) -> float:
251
+ """Map confidence to probability using a sigmoid function with adjustable steepness.
252
+
253
+ Args:
254
+ confidence: Input confidence score
255
+ true_threshold: Upper threshold
256
+ false_threshold: Lower threshold
257
+ target_conf: Target confidence point
258
+ target_prob: Target probability value
259
+ p_min: Minimum probability
260
+ p_max: Maximum probability
261
+ steepness_factor: Controls curve steepness (0-1, lower = less steep)
262
+ """
263
+ if confidence <= false_threshold:
264
+ return p_min
265
+
266
+ if confidence >= true_threshold:
267
+ return p_max
268
+
269
+ # Calculate parameters to ensure target_conf maps to target_prob
270
+ # For a sigmoid function: f(x) = L / (1 + e^(-k(x-x0)))
271
+
272
+ # First, normalize the target point
273
+ x_norm = (target_conf - false_threshold) / (true_threshold - false_threshold)
274
+ y_norm = (target_prob - p_min) / (p_max - p_min)
275
+
276
+ # Find x0 (midpoint) and k (steepness) to satisfy our target point
277
+ x0 = 0.30 # Midpoint of normalized range
278
+
279
+ # Calculate base k value to hit the target point
280
+ base_k = -np.log(1 / y_norm - 1) / (x_norm - x0)
281
+
282
+ # Apply steepness factor (lower = less steep)
283
+ k = base_k * steepness_factor
284
+
285
+ # With reduced steepness, we need to adjust x0 to still hit the target point
286
+ # Solve for new x0: y = 1/(1+e^(-k(x-x0))) => x0 = x + ln(1/y-1)/k
287
+ adjusted_x0 = x_norm + np.log(1 / y_norm - 1) / k
288
+
289
+ # Apply the sigmoid with our calculated parameters
290
+ x_scaled = (confidence - false_threshold) / (true_threshold - false_threshold)
291
+ sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))
292
+
293
+ # Ensure we still hit exactly p_min and p_max at the thresholds
294
+ # by rescaling the output slightly
295
+ min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
296
+ max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))
297
+
298
+ # Normalize the output
299
+ normalized = (sigmoid_value - min_val) / (max_val - min_val)
300
+
301
+ return p_min + normalized * (p_max - p_min)
302
+
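
A usage sketch of the direct (non-vLLM) detector, assuming a CUDA GPU with enough memory to hold InternVL2-8B locally; the frames and scene description are placeholders:

    import numpy as np

    vlm = InternVL(model_name="InternVL2-8B", device=0)
    frames = [np.zeros((448, 448, 3), dtype=np.uint8) for _ in range(3)]  # dummy RGB frames
    obj = vlm.detect(seq_of_frames=frames, scene_description="red car", threshold=0.35)
    print(obj)  # Object: red car, Detected: ..., Probability: ...
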
ns_vfs/vlm/internvl_utils.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torchvision.transforms as T
6
+ from decord import VideoReader, cpu
7
+ from PIL import Image
8
+ from torchvision.transforms.functional import InterpolationMode
9
+
10
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
11
+ IMAGENET_STD = (0.229, 0.224, 0.225)
12
+
13
+
14
+ def build_transform(input_size: int) -> T.Compose:
15
+ """Builds a transformation pipeline for the given input size."""
16
+ mean, std = IMAGENET_MEAN, IMAGENET_STD
17
+ return T.Compose(
18
+ [
19
+ T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
20
+ T.Resize(
21
+ (input_size, input_size),
22
+ interpolation=InterpolationMode.BICUBIC,
23
+ ),
24
+ T.ToTensor(),
25
+ T.Normalize(mean=mean, std=std),
26
+ ]
27
+ )
28
+
29
+
30
+ def assign_device_map(model_name, manual_gpu_id=0):
31
+ device_map = {}
32
+ world_size = torch.cuda.device_count()
33
+ num_layers = {
34
+ "InternVL2-1B": 24,
35
+ "InternVL2-2B": 24,
36
+ "InternVL2-4B": 32,
37
+ "InternVL2-8B": 32,
38
+ "InternVL2-26B": 48,
39
+ "InternVL2-40B": 60,
40
+ "InternVL2-Llama3-76B": 80,
41
+ }[model_name]
42
+ for layer_idx in range(num_layers):
43
+ device_map[f"language_model.model.layers.{layer_idx}"] = manual_gpu_id
44
+
45
+ device_map["vision_model"] = manual_gpu_id
46
+ device_map["mlp1"] = manual_gpu_id
47
+ device_map["language_model.model.tok_embeddings"] = manual_gpu_id
48
+ device_map["language_model.model.embed_tokens"] = manual_gpu_id
49
+ device_map["language_model.output"] = manual_gpu_id
50
+ device_map["language_model.model.norm"] = manual_gpu_id
51
+ device_map["language_model.lm_head"] = manual_gpu_id
52
+ device_map[f"language_model.model.layers.{num_layers - 1}"] = manual_gpu_id
53
+
54
+ return device_map
55
+
56
+
57
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
58
+ best_ratio_diff = float("inf")
59
+ best_ratio = (1, 1)
60
+ area = width * height
61
+ for ratio in target_ratios:
62
+ target_aspect_ratio = ratio[0] / ratio[1]
63
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
64
+ if ratio_diff < best_ratio_diff:
65
+ best_ratio_diff = ratio_diff
66
+ best_ratio = ratio
67
+ elif ratio_diff == best_ratio_diff:
68
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
69
+ best_ratio = ratio
70
+ return best_ratio
71
+
72
+
73
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
74
+ # Convert numpy array to PIL Image if needed
75
+ if isinstance(image, np.ndarray):
76
+ image = Image.fromarray(image)
77
+
78
+ orig_width, orig_height = image.size
79
+ aspect_ratio = orig_width / orig_height
80
+
81
+ # calculate the existing image aspect ratio
82
+ target_ratios = set(
83
+ (i, j)
84
+ for n in range(min_num, max_num + 1)
85
+ for i in range(1, n + 1)
86
+ for j in range(1, n + 1)
87
+ if i * j <= max_num and i * j >= min_num
88
+ )
89
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
90
+
91
+ # find the closest aspect ratio to the target
92
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
93
+
94
+ # calculate the target width and height
95
+ target_width = image_size * target_aspect_ratio[0]
96
+ target_height = image_size * target_aspect_ratio[1]
97
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
98
+
99
+ # resize the image
100
+ resized_img = image.resize((target_width, target_height))
101
+ processed_images = []
102
+ for i in range(blocks):
103
+ box = (
104
+ (i % (target_width // image_size)) * image_size,
105
+ (i // (target_width // image_size)) * image_size,
106
+ ((i % (target_width // image_size)) + 1) * image_size,
107
+ ((i // (target_width // image_size)) + 1) * image_size,
108
+ )
109
+ # split the image
110
+ split_img = resized_img.crop(box)
111
+ processed_images.append(split_img)
112
+ assert len(processed_images) == blocks
113
+ if use_thumbnail and len(processed_images) != 1:
114
+ thumbnail_img = image.resize((image_size, image_size))
115
+ processed_images.append(thumbnail_img)
116
+ return processed_images
117
+
118
+
119
+ def split_model(model_name):
120
+ device_map = {}
121
+ world_size = torch.cuda.device_count()
122
+ num_layers = {
123
+ "InternVL2-1B": 24,
124
+ "InternVL2-2B": 24,
125
+ "InternVL2-4B": 32,
126
+ "InternVL2-8B": 32,
127
+ "InternVL2-26B": 48,
128
+ "InternVL2-40B": 60,
129
+ "InternVL2-Llama3-76B": 80,
130
+ }[model_name]
131
+ # Since the first GPU will be used for ViT, treat it as half a GPU.
132
+ num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
133
+ num_layers_per_gpu = [num_layers_per_gpu] * world_size
134
+ num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
135
+ layer_cnt = 0
136
+ for i, num_layer in enumerate(num_layers_per_gpu):
137
+ for j in range(num_layer):
138
+ device_map[f"language_model.model.layers.{layer_cnt}"] = i
139
+ layer_cnt += 1
140
+ device_map["vision_model"] = 0
141
+ device_map["mlp1"] = 0
142
+ device_map["language_model.model.tok_embeddings"] = 0
143
+ device_map["language_model.model.embed_tokens"] = 0
144
+ device_map["language_model.output"] = 0
145
+ device_map["language_model.model.norm"] = 0
146
+ device_map["language_model.lm_head"] = 0
147
+ device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
148
+
149
+ return device_map
150
+
151
+
152
+ def move_tensors_to_gpu(module):
153
+ for name, tensor in module.named_buffers():
154
+ if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
155
+ module.register_buffer(name, tensor.cuda(), persistent=False)
156
+ for _, param in module.named_parameters():
157
+ if param.device.type == "cpu":
158
+ param.data = param.data.cuda()
159
+
160
+
161
+ # video multi-round conversation (视频多轮对话)
162
+ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
163
+ if bound:
164
+ start, end = bound[0], bound[1]
165
+ else:
166
+ start, end = -100000, 100000
167
+ start_idx = max(first_idx, round(start * fps))
168
+ end_idx = min(round(end * fps), max_frame)
169
+ seg_size = float(end_idx - start_idx) / num_segments
170
+ frame_indices = np.array(
171
+ [int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]
172
+ )
173
+ return frame_indices
174
+
175
+
176
+ def load_video_from_seq_of_frames(
177
+ seq_of_frames: list[np.ndarray],
178
+ input_size=448,
179
+ max_num=1,
180
+ device="cuda",
181
+ dtype=torch.bfloat16,
182
+ ):
183
+ pixel_values_list, num_patches_list = [], []
184
+ transform = build_transform(input_size=input_size)
185
+ for img in seq_of_frames:
186
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
187
+ pixel_values = [transform(tile) for tile in img]
188
+ pixel_values = torch.stack(pixel_values).to(dtype=dtype, device=device) # Convert to bfloat16
189
+ num_patches_list.append(pixel_values.shape[0])
190
+ pixel_values_list.append(pixel_values)
191
+ return torch.cat(pixel_values_list), num_patches_list
192
+
193
+
194
+ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
195
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
196
+ max_frame = len(vr) - 1
197
+ fps = float(vr.get_avg_fps())
198
+
199
+ pixel_values_list, num_patches_list = [], []
200
+ transform = build_transform(input_size=input_size)
201
+ frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
202
+ for frame_index in frame_indices:
203
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
204
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
205
+ pixel_values = [transform(tile) for tile in img]
206
+ pixel_values = torch.stack(pixel_values)
207
+ num_patches_list.append(pixel_values.shape[0])
208
+ pixel_values_list.append(pixel_values.to(torch.bfloat16))
209
+ pixel_values = torch.cat(pixel_values_list)
210
+ return pixel_values, num_patches_list
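
A small sketch of dynamic_preprocess with max_num=1, the value load_video_from_seq_of_frames uses by default: only the (1, 1) tiling grid is allowed, so every frame becomes exactly one 448x448 tile and num_patches_list ends up as all ones. The frame below is a dummy array:

    import numpy as np

    frame = np.zeros((720, 1280, 3), dtype=np.uint8)  # hypothetical 720p RGB frame
    tiles = dynamic_preprocess(frame, image_size=448, use_thumbnail=True, max_num=1)
    print(len(tiles))     # 1
    print(tiles[0].size)  # (448, 448)
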
ns_vfs/vlm/obj.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ import logging
3
+
4
+ class DetectedObject:
5
+ """Detected Object class."""
6
+ def __init__(self,
7
+ name: str,
8
+ is_detected: bool,
9
+ confidence: float,
10
+ probability: float,
11
+ model_name: str | None = None,
12
+ bounding_box_of_all_obj: list[Any] | None = None):
13
+ self.name = name
14
+ self.confidence = confidence
15
+ self.probability = probability
16
+ self.is_detected = is_detected
17
+ self.model_name = model_name
18
+ self.bounding_box_of_all_obj = bounding_box_of_all_obj
19
+
20
+ def __str__(self) -> str:
21
+ return f"Object: {self.name}, Detected: {self.is_detected}, Probability: {self.get_detected_probability()}"
22
+
23
+ def get_detected_probability(self) -> float:
24
+ if not self.is_detected:
25
+ return 0
26
+ if self.probability > 0:
27
+ return self.probability
28
+ if self.confidence > 0 and self.probability == 0:
29
+ logging.info("Probability is not set, using confidence: %f", self.confidence)
30
+ return self.confidence
31
+ return self.probability
ns_vfs/vlm/vllm_client.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+
3
+ from openai import OpenAI
4
+ import numpy as np
5
+ import math
6
+ import cv2
7
+
8
+
9
+ from ns_vfs.vlm.obj import DetectedObject
10
+
11
+ class VLLMClient:
12
+ def __init__(
13
+ self,
14
+ api_key="EMPTY",
15
+ api_base="http://localhost:8000/v1",
16
+ model="OpenGVLab/InternVL2-8B",
17
+ # model="Qwen/Qwen2.5-VL-7B-Instruct",
18
+ ):
19
+ self.client = OpenAI(api_key=api_key, base_url=api_base)
20
+ self.model = model
21
+
22
+ # def _encode_frame(self, frame):
23
+ # return base64.b64encode(frame.tobytes()).decode("utf-8")
24
+ def _encode_frame(self, frame):
25
+ # Encode a uint8 numpy array (image) as a JPEG and then base64 encode it.
26
+ ret, buffer = cv2.imencode(".jpg", frame)
27
+ if not ret:
28
+ raise ValueError("Could not encode frame")
29
+ return base64.b64encode(buffer).decode("utf-8")
30
+
31
+ def detect(
32
+ self,
33
+ seq_of_frames: list[np.ndarray],
34
+ scene_description: str,
35
+ threshold: float
36
+ ) -> DetectedObject:
37
+
38
+ parsing_rule = "You must only return a Yes or No, and not both, to any question asked. You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times. For example, if the question is \"Is there a cat present in the sequence of images?\", the answer must only be 'Yes' or 'No'."
39
+ prompt = rf"Is there a {scene_description} present in the sequence of images? " f"\n[PARSING RULE]: {parsing_rule}"
40
+
41
+ # Encode each frame.
42
+ encoded_images = [self._encode_frame(frame) for frame in seq_of_frames]
43
+
44
+ # Build the user message: a text prompt plus one image for each frame.
45
+ user_content = [
46
+ {
47
+ "type": "text",
48
+ "text": "The following is the sequence of images",
49
+ }
50
+ ]
51
+ for encoded in encoded_images:
52
+ user_content.append(
53
+ {
54
+ "type": "image_url",
55
+ "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
56
+ }
57
+ )
58
+
59
+ # Create a chat completion request.
60
+ chat_response = self.client.chat.completions.create(
61
+ model=self.model,
62
+ messages=[
63
+ {"role": "system", "content": prompt},
64
+ {"role": "user", "content": user_content},
65
+ ],
66
+ max_tokens=1,
67
+ temperature=0.0,
68
+ logprobs=True,
69
+ top_logprobs=20,
70
+ )
71
+ content = chat_response.choices[0].message.content
72
+ is_detected = "yes" in content.lower()
73
+
74
+ # Retrieve the list of TopLogprob objects.
75
+ top_logprobs_list = chat_response.choices[0].logprobs.content[0].top_logprobs
76
+
77
+ # Build a mapping from token text (stripped) to its probability.
78
+ token_prob_map = {}
79
+ for top_logprob in top_logprobs_list:
80
+ token_text = top_logprob.token.strip()
81
+ token_prob_map[token_text] = np.exp(top_logprob.logprob)
82
+
83
+ # Extract probabilities for "Yes" and "No"
84
+ yes_prob = token_prob_map.get("Yes", 0.0)
85
+ no_prob = token_prob_map.get("No", 0.0)
86
+
87
+ # Compute the normalized probability for "Yes": p_yes / (p_yes + p_no)
88
+ if yes_prob + no_prob > 0:
89
+ confidence = yes_prob / (yes_prob + no_prob)
90
+ else:
91
+ raise ValueError("No probabilities for 'Yes' or 'No' found in the response.")
92
+
93
+ # print(f"Is detected: {is_detected}")
94
+ # print(f"Confidence: {confidence:.3f}")
95
+
96
+
97
+ probability = self.calibrate(confidence=confidence, false_threshold=threshold)
98
+
99
+ return DetectedObject(
100
+ name=scene_description,
101
+ is_detected=is_detected,
102
+ confidence=round(confidence, 3),
103
+ probability=round(probability, 3)
104
+ )
105
+
106
+ def calibrate(
107
+ self,
108
+ confidence: float,
109
+ true_threshold=0.95,
110
+ false_threshold=0.40,
111
+ target_conf=0.60,
112
+ target_prob=0.78,
113
+ p_min=0.01,
114
+ p_max=0.99,
115
+ steepness_factor=0.7,
116
+ ) -> float:
117
+ """Map confidence to probability using a sigmoid function with adjustable steepness.
118
+
119
+ Args:
120
+ confidence: Input confidence score
121
+ true_threshold: Upper threshold
122
+ false_threshold: Lower threshold
123
+ target_conf: Target confidence point
124
+ target_prob: Target probability value
125
+ p_min: Minimum probability
126
+ p_max: Maximum probability
127
+ steepness_factor: Controls curve steepness (0-1, lower = less steep)
128
+ """
129
+ if confidence <= false_threshold:
130
+ return p_min
131
+
132
+ if confidence >= true_threshold:
133
+ return p_max
134
+
135
+ # Calculate parameters to ensure target_conf maps to target_prob
136
+ # For a sigmoid function: f(x) = L / (1 + e^(-k(x-x0)))
137
+
138
+ # First, normalize the target point
139
+ x_norm = (target_conf - false_threshold) / (true_threshold - false_threshold)
140
+ y_norm = (target_prob - p_min) / (p_max - p_min)
141
+
142
+ # Find x0 (midpoint) and k (steepness) to satisfy our target point
143
+ x0 = 0.30 # Midpoint of normalized range
144
+
145
+ # Calculate base k value to hit the target point
146
+ base_k = -np.log(1 / y_norm - 1) / (x_norm - x0)
147
+
148
+ # Apply steepness factor (lower = less steep)
149
+ k = base_k * steepness_factor
150
+
151
+ # With reduced steepness, we need to adjust x0 to still hit the target point
152
+ # Solve for new x0: y = 1/(1+e^(-k(x-x0))) => x0 = x + ln(1/y-1)/k
153
+ adjusted_x0 = x_norm + np.log(1 / y_norm - 1) / k
154
+
155
+ # Apply the sigmoid with our calculated parameters
156
+ x_scaled = (confidence - false_threshold) / (true_threshold - false_threshold)
157
+ sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))
158
+
159
+ # Ensure we still hit exactly p_min and p_max at the thresholds
160
+ # by rescaling the output slightly
161
+ min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
162
+ max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))
163
+
164
+ # Normalize the output
165
+ normalized = (sigmoid_value - min_val) / (max_val - min_val)
166
+
167
+ return p_min + normalized * (p_max - p_min)
168
+
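
A usage sketch of VLLMClient, assuming the OpenAI-compatible vLLM server from vllm_serve.sh is running at http://localhost:8000/v1 and serving OpenGVLab/InternVL2-8B; the frames and scene description are placeholders. The calibrate calls need no server and just illustrate the clamping behavior of the default thresholds:

    import numpy as np

    client = VLLMClient()
    frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(3)]  # dummy frames
    obj = client.detect(seq_of_frames=frames, scene_description="red car", threshold=0.35)
    print(obj)  # Object: red car, Detected: ..., Probability: ...

    print(client.calibrate(0.30))  # 0.01  (<= false_threshold, clamped to p_min)
    print(client.calibrate(0.97))  # 0.99  (>= true_threshold, clamped to p_max)
    print(client.calibrate(0.60))  # ~0.78 (near target_prob; the final rescaling shifts it slightly)
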
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "nsvs"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.13"
5
+ dependencies = [
6
+ "accelerate>=1.10.1",
7
+ "cmake==3.29.6",
8
+ "decord>=0.6.0",
9
+ "einops>=0.8.1",
10
+ "gradio>=5.44.1",
11
+ "ninja>=1.13.0",
12
+ "numpy>=2.3.2",
13
+ "openai>=1.106.1",
14
+ "opencv-python>=4.11.0.86",
15
+ "safetensors>=0.6.2",
16
+ "sentencepiece>=0.2.1",
17
+ "stormpy>=1.10.1",
18
+ "timm>=1.0.19",
19
+ "tqdm>=4.67.1",
20
+ "transformers>=4.41,<4.47",
21
+ ]
22
+
scripts/no_nsvs.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from tqdm import tqdm
3
+ import json
4
+ import time
5
+ import os
6
+
7
+ from ns_vfs.vlm.internvl import InternVL
8
+ from ns_vfs.video.read_tlv import TLVReader
9
+
10
+ class RunConfig(Enum):
11
+ SLIDING_WINDOW = "sliding_window"
12
+ FRAME_WISE = "frame_wise"
13
+ CURRENT_CONFIG = RunConfig.SLIDING_WINDOW
14
+ TLV_PATH = "/nas/dataset/tlv-dataset-v1"
15
+ MODEL_NAME = "InternVL2-8B"
16
+ DEVICE = 7 # GPU device index
17
+
18
+ CALIBRATION_THRESHOLD = 0.349 # vllm threshold
19
+ THRESHOLD = 0.5 # detection threshold (fw)
20
+ STRIDE = 10 # slide stride (sw)
21
+ WINDOW = 20 # window length (sw)
22
+
23
+ def sliding_window(entry): # answers "which sequence of `WINDOW` frames can best answer the query"
24
+ query = entry["tl"]["query"]
25
+ frames = entry["images"]
26
+
27
+ model = InternVL(model_name=MODEL_NAME, device=DEVICE)
28
+ best = {"prob": -1.0, "start": 1, "end": 1}
29
+ foi = []
30
+
31
+ t = 0
32
+ windows = list(range(0, len(frames), STRIDE))
33
+ with tqdm(windows, desc=f"Sliding window (stride={STRIDE}, window={WINDOW})") as pbar:
34
+ for t in pbar:
35
+ end_idx = min(t + WINDOW, len(frames))
36
+ seq = frames[t:end_idx]
37
+
38
+ detect = model.detect(seq, query, CALIBRATION_THRESHOLD)
39
+ prob = detect.probability
40
+ is_detected = detect.is_detected
41
+
42
+ pbar.set_postfix( {"best_prob": f"{best['prob']:.3f}", "current_prob": f"{prob:.3f}", "detected": is_detected} )
43
+
44
+ if prob > best["prob"] and is_detected:
45
+ best.update({"prob": prob, "start": t, "end": end_idx})
46
+
47
+ if best["prob"] != -1.0:
48
+ foi = list(range(best["start"], best["end"]))  # end index is exclusive
49
+
50
+ return foi
51
+
52
+ def frame_wise(entry):
53
+ query = entry["tl"]["query"]
54
+ frames = entry["images"]
55
+
56
+ model = InternVL(model_name=MODEL_NAME, device=DEVICE)
57
+ foi = []
58
+
59
+ t = 0
60
+ windows = range(len(frames))
61
+ with tqdm(windows, desc=f"Framewise (threshold={THRESHOLD})") as pbar:
62
+ for t in pbar:
63
+ f = [frames[t]]
64
+
65
+ detect = model.detect(f, query, CALIBRATION_THRESHOLD)
66
+ prob = detect.probability
67
+ is_detected = detect.is_detected
68
+
69
+ pbar.set_postfix( {"current_prob": f"{prob:.3f}", "detected": is_detected} )
70
+
71
+ if prob > THRESHOLD and is_detected:
72
+ foi.append(t)
73
+
74
+ return foi
75
+
76
+
77
+ def main():
78
+ reader = TLVReader(TLV_PATH)
79
+ data = reader.read_video()
80
+ if not data:
81
+ return
82
+
83
+ folder_name = f"{MODEL_NAME}_{CURRENT_CONFIG.value}"
84
+ folder_name = os.path.join("/nas/mars/experiment_result/nsvs/nsvs2-prelims", folder_name)
85
+ if not os.path.exists(folder_name):
86
+ os.makedirs(folder_name)
87
+
88
+ with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
89
+ for i, entry in pbar:
90
+ start_time = time.time()
91
+ if CURRENT_CONFIG == RunConfig.SLIDING_WINDOW:
92
+ foi = sliding_window(entry)
93
+ else:
94
+ foi = frame_wise(entry)
95
+ end_time = time.time()
96
+
97
+ output = {
98
+ "propositions": entry["tl"]["propositions"],
99
+ "specification": entry["tl"]["specification"],
100
+ "ground_truth": entry["metadata"]["ground_truth"],
101
+ "frames_of_interest": foi,
102
+ "type": entry["metadata"]["type"],
103
+ "number_of_frames": entry["video_info"].frame_count,
104
+ "processing_time_seconds": round(end_time - start_time, 3),
105
+ }
106
+
107
+ with open(f"{folder_name}/output_{i}.json", "w") as f:
108
+ json.dump(output, f, indent=4)
109
+
110
+
111
+ if __name__ == "__main__":
112
+ main()
scripts/plot.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.interpolate import make_interp_spline
2
+ import matplotlib.colors as mcolors
3
+ from collections import defaultdict
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import json
7
+ import os
8
+
9
+
10
+ folder1 = "/nas/mars/experiment_result/nsvs/nsvs2-prelims/nsvs"
11
+ folder2 = "/nas/mars/experiment_result/nsvs/nsvs2-prelims/InternVL2-8B_frame_wise"
12
+ folder3 = "/nas/mars/experiment_result/nsvs/nsvs2-prelims/InternVL2-8B_sliding_window"
13
+
14
+ out_path_duration = "scripts/plots/plot_duration.png"
15
+ out_path_complexity = "scripts/plots/plot_complexity.png"
16
+
17
+ labels = ["NSVS", "Frame-Wise", "Sliding-Window"]
18
+ colors = ["#1f77b4", "#b4421f", "#2ca02c"]
19
+
20
+ complexity_bins = [1, 2, 3]
21
+
22
+
23
+ def _safe_json_load(path):
24
+ with open(path, "r") as f:
25
+ return json.load(f)
26
+
27
+ def _per_file_stats(pred, gt):
28
+ tp = len(pred & gt)
29
+ fp = len(pred - gt)
30
+ fn = len(gt - pred)
31
+ precision_f = tp / (tp + fp) if (tp + fp) else 0.0
32
+ recall_f = tp / (tp + fn) if (tp + fn) else 0.0
33
+ f1_file = (2 * precision_f * recall_f / (precision_f + recall_f)
34
+ if (precision_f + recall_f) else 0.0)
35
+ return tp, fp, fn, precision_f, recall_f, f1_file
36
+
37
+ def _iter_json(folder):
38
+ for fname in os.listdir(folder):
39
+ if fname.endswith(".json"):
40
+ yield os.path.join(folder, fname)
41
+
42
+ def compute_statistics(folders):
43
+ out = {}
44
+ for folder in folders:
45
+ TP = FP = FN = 0
46
+ per_file_f1 = []
47
+
48
+ for fpath in _iter_json(folder):
49
+ data = _safe_json_load(fpath)
50
+ pred = set(map(int, data.get("frames_of_interest", [])))
51
+ gt = set(map(int, data.get("ground_truth", [])))
52
+
53
+ tp, fp, fn, _, _, f1_file = _per_file_stats(pred, gt)
54
+ TP += tp; FP += fp; FN += fn
55
+ per_file_f1.append(float(f1_file))
56
+
57
+ precision = TP / (TP + FP) if (TP + FP) else 0.0
58
+ recall = TP / (TP + FN) if (TP + FN) else 0.0
59
+ f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
60
+
61
+ out[folder] = {
62
+ "precision": precision,
63
+ "recall": recall,
64
+ "f1": f1,
65
+ "TP": TP, "FP": FP, "FN": FN,
66
+ "per_file_f1": per_file_f1,
67
+ }
68
+
69
+ # Pretty print
70
+ for folder, stats in out.items():
71
+ print(f"[{folder}] Overall metrics:")
72
+ print(f" Precision: {stats['precision']:.4f}")
73
+ print(f" Recall: {stats['recall']:.4f}")
74
+ print(f" F1: {stats['f1']:.4f}\n")
75
+
76
+ return out
77
+
78
+
79
+ def _collect_duration_points(folder):
80
+ xs, ys = [], []
81
+ for fpath in _iter_json(folder):
82
+ data = _safe_json_load(fpath)
83
+ pred = set(map(int, data.get("frames_of_interest", [])))
84
+ gt = set(map(int, data.get("ground_truth", [])))
85
+ tp, fp, fn, _, _, f1 = _per_file_stats(pred, gt)
86
+
87
+ nframes = int(data.get("number_of_frames", 0))
88
+ if nframes <= 0:
89
+ continue
90
+ minutes = int(nframes / 4)  # convert frame count to minutes (assumes 4 sampled frames per minute)
91
+ xs.append(minutes)
92
+ ys.append(float(f1))
93
+ return np.array(xs, dtype=int), np.array(ys, dtype=float)
94
+
95
+ def _compute_envelope(xs, ys, bandwidth=20, smooth_band=600, smooth_center=11):
96
+ if len(xs) == 0:
97
+ return None
98
+
99
+ grouped = defaultdict(list)
100
+ for x, y in zip(xs, ys):
101
+ grouped[int(x)].append(float(y))
102
+
103
+ durations = np.array(sorted(grouped.keys()))
104
+ if len(durations) == 0:
105
+ return None
106
+
107
+ ymin_raw = np.array([min(grouped[d]) for d in durations])
108
+ ymax_raw = np.array([max(grouped[d]) for d in durations])
109
+ ymid_raw = (ymin_raw + ymax_raw) / 2.0
110
+
111
+ ymin_s, ymax_s = [], []
112
+ for d in durations:
113
+ mask = np.abs(durations - d) <= bandwidth
114
+ ymin_s.append(ymin_raw[mask].min())
115
+ ymax_s.append(ymax_raw[mask].max())
116
+ ymin_s, ymax_s = np.array(ymin_s), np.array(ymax_s)
117
+ ymid_s = (ymin_s + ymax_s) / 2.0
118
+
119
+ if len(durations) >= 4:
120
+ x_band = np.linspace(durations.min(), durations.max(), smooth_band)
121
+ ymin_smooth = make_interp_spline(durations, ymin_s, k=3)(x_band)
122
+ ymax_smooth = make_interp_spline(durations, ymax_s, k=3)(x_band)
123
+
124
+ x_center = np.linspace(durations.min(), durations.max(), smooth_center)
125
+ ymid_smooth = make_interp_spline(durations, ymid_s, k=3)(x_center)
126
+ else:
127
+ x_band, ymin_smooth, ymax_smooth = durations, ymin_s, ymax_s
128
+ x_center, ymid_smooth = durations, ymid_s
129
+
130
+ return x_band, ymin_smooth, ymax_smooth, x_center, ymid_smooth
131
+
132
+ def _shrink_band(ymin_spline, ymax_spline, factor=0.5):
133
+ center = (ymin_spline + ymax_spline) / 2.0
134
+ ymin_new = center - factor * (center - ymin_spline)
135
+ ymax_new = center + factor * (ymax_spline - center)
136
+ return ymin_new, ymax_new, center
137
+
138
+ def plot_duration(folders, labels, colors, out_path):
139
+ envs = []
140
+ for folder in folders:
141
+ xs, ys = _collect_duration_points(folder)
142
+ envs.append(_compute_envelope(xs, ys))
143
+
144
+ if all(env is None for env in envs):
145
+ print("Not enough data with valid 'number_of_frames' to plot.")
146
+ return
147
+
148
+ fig, ax = plt.subplots(figsize=(9, 6))
149
+
150
+ for env, lab, col in zip(envs, labels, colors):
151
+ if env is None:
152
+ continue
153
+ x_band, ymin_s, ymax_s, x_center, ymid_s = env
154
+ ymin_plot, ymax_plot, _ = _shrink_band(ymin_s, ymax_s, factor=0.5)
155
+
156
+ base = mcolors.to_rgb(col)
157
+ darker = tuple(max(0.0, c * 0.75) for c in base)
158
+
159
+ ax.fill_between(x_band, ymin_plot, ymax_plot, color=base, alpha=0.22)
160
+ ax.plot(x_center, ymid_s, linewidth=2.5, color=darker, label=lab)
161
+
162
+ ax.set_xlabel("Minutes", fontsize=17)
163
+ ax.set_ylabel("F1 Score", fontsize=17)
164
+ ax.tick_params(axis="both", labelsize=15)
165
+ ax.grid(True, linestyle="--", alpha=0.4)
166
+ ax.legend(fontsize=15)
167
+
168
+ fig.tight_layout()
169
+ fig.savefig(out_path, dpi=200)
170
+ plt.close(fig)
171
+
172
+
173
+ def _normalize_prop_names(prop_obj):
174
+ if isinstance(prop_obj, dict):
175
+ return {str(k) for k in prop_obj.keys() if str(k).strip()}
176
+ elif isinstance(prop_obj, (list, tuple, set)):
177
+ flat = []
178
+ for item in prop_obj:
179
+ if isinstance(item, (list, tuple, set)):
180
+ flat.extend(item)
181
+ else:
182
+ flat.append(item)
183
+ return {str(x) for x in flat if str(x).strip()}
184
+ elif prop_obj:
185
+ return {str(prop_obj)}
186
+ return set()
187
+
188
+ def _complexity_by_props(folder, bins):
189
+ by_props = defaultdict(list)
190
+ for fpath in _iter_json(folder):
191
+ data = _safe_json_load(fpath)
192
+ pred = set(map(int, data.get("frames_of_interest", [])))
193
+ gt = set(map(int, data.get("ground_truth", [])))
194
+ tp, fp, fn, _, _, f1_file = _per_file_stats(pred, gt)
195
+
196
+ prop_names = _normalize_prop_names(data.get("propositions", []))
197
+ n_props = len(prop_names)
198
+ if n_props in bins:
199
+ by_props[n_props].append(float(f1_file))
200
+ return by_props
201
+
202
+ def plot_complexity(folders, labels, colors, bins, out_path):
203
+ all_by_props = [ _complexity_by_props(f, bins) for f in folders ]
204
+
205
+ width = 0.25
206
+ offsets = [-(width), 0.0, width] # for three models
207
+ fig, ax = plt.subplots(figsize=(9, 6))
208
+
209
+ handles = []
210
+ for idx, (by_props, lab, col, off) in enumerate(zip(all_by_props, labels, colors, offsets)):
211
+ positions = [p + off for p in bins]
212
+ data = [by_props.get(k, []) for k in bins]
213
+
214
+ bp = ax.boxplot(
215
+ data,
216
+ positions=positions,
217
+ widths=width * 0.9,
218
+ patch_artist=True,
219
+ showfliers=False,
220
+ )
221
+
222
+ for box in bp['boxes']:
223
+ box.set_facecolor(col)
224
+ box.set_alpha(0.35)
225
+ box.set_edgecolor(col)
226
+ box.set_linewidth(1.5)
227
+ for element in ['whiskers', 'caps', 'medians']:
228
+ for artist in bp[element]:
229
+ artist.set_color(col)
230
+ artist.set_linewidth(1.5)
231
+
232
+ handles.append(bp["boxes"][0])
233
+
234
+ ax.set_xticks(bins)
235
+ ax.set_xticklabels([p for p in bins], fontsize=15)
236
+ ax.set_xlabel("Number of Propositions", fontsize=17)
237
+ ax.set_ylabel("F1 Score", fontsize=17)
238
+ ax.tick_params(axis="y", labelsize=15)
239
+ ax.grid(True, linestyle="--", alpha=0.4)
240
+ ax.legend(handles, labels, fontsize=13)
241
+
242
+ fig.tight_layout()
243
+ fig.savefig(out_path, dpi=200)
244
+ plt.close(fig)
245
+
246
+
247
+ if __name__ == "__main__":
248
+ folders = [folder1, folder2, folder3]
249
+ compute_statistics(folders)
250
+ plot_duration(folders, labels, colors, out_path_duration)
251
+ plot_complexity(folders, labels, colors, complexity_bins, out_path_complexity)
252
+
scripts/plots/plot_complexity.png ADDED

Git LFS Details

  • SHA256: a95c67486c6fdeb5a8ac94a484ee4c1bb79233c40fa285c2cb44ccf07e8f9b8f
  • Pointer size: 130 Bytes
  • Size of remote file: 77.8 kB
scripts/plots/plot_duration.png ADDED

Git LFS Details

  • SHA256: f5ea7015951a7132ac7b23b847484ceaf7723f471a3e254922f6aa7a1707193b
  • Pointer size: 131 Bytes
  • Size of remote file: 184 kB
vllm_serve.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # MODEL="Qwen/Qwen2.5-VL-7B-Instruct"
4
+ # MODEL="OpenGVLab/InternVL2-8B",
5
+ # export CUDA_DEVICE_ORDER="PCI_BUS_ID"
6
+ # export NCCL_P2P_DISABLE=1
7
+ # export CUDA_VISIBLE_DEVICES="0"
8
+ # export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
9
+ PORT=8000
10
+ vllm serve "OpenGVLab/InternVL2-8B" \
11
+ --port "$PORT" \
12
+ --trust-remote-code \
13
+ --limit-mm-per-prompt image=4 \
14
+ --disable-log-requests
15
+ # Optional: --max-model-len 8192
16
+ # Optional: --gpu-memory-utilization 0.97