Syzygianinfern0 commited on
Commit
47875a1
·
0 Parent(s):

Initial clean commit for HF Spaces deployment with LFS

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
2
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ vendors/
2
+ output/
3
+ uv.lock
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ # VS Code
167
+ .vscode
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM syzygianinfern0/stormbase:latest
2
+
3
+ # Set up a new user named "user" with user ID 1000
4
+ RUN useradd -m -u 1000 user
5
+
6
+ # Switch to the "user" user
7
+ USER user
8
+
9
+ # Set home to the user's home directory
10
+ ENV HOME=/home/user \
11
+ PATH=/home/user/.local/bin:$PATH
12
+
13
+ # Set the working directory to the user's home directory
14
+ WORKDIR $HOME/app
15
+
16
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
17
+ COPY --chown=user . $HOME/app
18
+
19
+ # Expose Gradio port
20
+ EXPOSE 7860
21
+
22
+ # Run your Gradio app
23
+ CMD ["./launch_space.sh"]
Dockerfile.stormbase ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Start from a base image with CUDA and Python
2
+ FROM nvidia/cuda:12.8.1-base-ubuntu22.04
3
+
4
+ # System setup
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Install system packages
8
+ RUN apt-get update && apt-get install -y \
9
+ python3 python3-pip python3-dev python3-venv \
10
+ git wget unzip cmake build-essential \
11
+ libboost-all-dev libginac-dev libglpk-dev \
12
+ m4 libcln-dev libgmp-dev automake libhwloc-dev \
13
+ libgl1 libglib2.0-0 && \
14
+ rm -rf /var/lib/apt/lists/*
15
+
16
+ # Set working directory
17
+ WORKDIR /app
18
+
19
+ # Copy requirements file
20
+ COPY requirements.txt .
21
+
22
+ # Upgrade pip and install dependencies
23
+ RUN pip install --upgrade pip && \
24
+ pip install -r requirements.txt
25
+
26
+ # ====== Precompile carl-storm ======
27
+ WORKDIR /opt
28
+ RUN git clone https://github.com/moves-rwth/carl-storm && \
29
+ cd carl-storm && \
30
+ mkdir build && cd build && \
31
+ cmake .. && make lib_carl
32
+
33
+ # ====== Precompile Storm ======
34
+ WORKDIR /opt
35
+ RUN wget https://github.com/moves-rwth/storm/archive/stable.zip && \
36
+ unzip stable.zip && \
37
+ cd storm-stable && \
38
+ mkdir build && cd build && \
39
+ cmake ../ -DCMAKE_BUILD_TYPE=Release \
40
+ -DSTORM_DEVELOPER=OFF \
41
+ -DSTORM_LOG_DISABLE_DEBUG=ON \
42
+ -DSTORM_PORTABLE=ON \
43
+ -DSTORM_USE_SPOT_SHIPPED=ON && \
44
+ make -j12
45
+
46
+ RUN pip install stormpy
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ # Neuro Symbolic Video Search with Temporal Logic (NSVS-TL)
4
+
5
+ [![arXiv](https://img.shields.io/badge/arXiv-2403.11021-b31b1b.svg)](https://arxiv.org/abs/2403.11021) [![Paper](https://img.shields.io/badge/Paper-pdf-green.svg)](https://link.springer.com/chapter/10.1007/978-3-031-73229-4_13) [![Website](https://img.shields.io/badge/ProjectWebpage-nsvs--tl-orange.svg)](https://utaustin-swarmlab.github.io/nsvs/) [![GitHub](https://img.shields.io/badge/Code-Source--Code-blue.svg)](https://github.com/UTAustin-SwarmLab/Neuro-Symbolic-Video-Search-Temporal-Logic) [![GitHub](https://img.shields.io/badge/Code-Dataset-blue.svg)](https://github.com/UTAustin-SwarmLab/Temporal-Logic-Video-Dataset)
6
+ </div>
7
+
8
+ ## Abstract
9
+
10
+ The unprecedented surge in video data production in recent years necessitates efficient tools to extract meaningful frames from videos for downstream tasks. Long-term temporal reasoning is a key desideratum for frame retrieval systems. While state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are proficient in short-term semantic understanding, they surprisingly fail at long-term reasoning across frames. A key reason for this failure is that they intertwine per-frame perception and temporal reasoning into a single deep network. Hence, decoupling but co-designing the semantic understanding and temporal reasoning is essential for efficient scene identification. We propose a system that leverages vision-language models for semantic understanding of individual frames but effectively reasons about the long-term evolution of events using state machines and temporal logic (TL) formulae that inherently capture memory. Our TL-based reasoning improves the F1 score of complex event identification by 9-15% compared to benchmarks that use GPT-4 for reasoning on state-of-the-art self-driving datasets such as Waymo and NuScenes. The source code is available on Github.
11
+
12
+ ## Installation Guide
13
+ Ensure you have **CUDA 12.4** installed and available on your system.
14
+ On Linux, you can verify with:
15
+ ```bash
16
+ nvcc --version
17
+ ```
18
+
19
+ From the root of the repo, run the following to build all STORM dependencies:
20
+ ```bash
21
+ ./build_dependency
22
+ ```
23
+
24
+ Next, install uv:
25
+ ```bash
26
+ pip install uv
27
+ ```
28
+
29
+ Finally, install everything in `pyproject.toml` to build project dependencies:
30
+ ```bash
31
+ uv sync
32
+ ```
33
+
34
+
35
+ ## Running the System
36
+
37
+ NSVS can be run in two ways: running it with raw mp4 files and input queries or running it via the TLV dataset.
38
+
39
+ To run it with mp4 files, modify the mp4 file paths and the natural language search query inside `execute_with_mp4.py` and run it with:
40
+ ```bash
41
+ uv run execute_with_mp4
42
+ ```
43
+
44
+ To run it with the TLV dataset, first download the dataset from [GitHub](https://github.com/UTAustin-SwarmLab/Temporal-Logic-Video-Dataset). Then, specify the dataset path in `execute_with_tlv.py` and run the program:
45
+ ```bash
46
+ uv run execute_with_tlv
47
+ ```
48
+
49
+
50
+ ## Connect with Me
51
+
52
+ <p align="center">
53
+ <em>Feel free to connect with me through these professional channels:</em>
54
+ <p align="center">
55
+ <a href="https://www.linkedin.com/in/mchoi07/" target="_blank"><img src="https://img.shields.io/badge/-LinkedIn-0077B5?style=flat-square&logo=Linkedin&logoColor=white" alt="LinkedIn"/></a>
56
+ <a href="mailto:[email protected]"><img src="https://img.shields.io/badge/-Email-D14836?style=flat-square&logo=Gmail&logoColor=white" alt="Email"/></a>
57
+ <a href="https://scholar.google.com/citations?user=ai4daB8AAAAJ&hl" target="_blank"><img src="https://img.shields.io/badge/-Google%20Scholar-4285F4?style=flat-square&logo=google-scholar&logoColor=white" alt="Google Scholar"/></a>
58
+ <a href="https://minkyuchoi-07.github.io" target="_blank"><img src="https://img.shields.io/badge/-Website-00C7B7?style=flat-square&logo=Internet-Explorer&logoColor=white" alt="Website"/></a>
59
+ <a href="https://x.com/MinkyuChoi7" target="_blank"><img src="https://img.shields.io/badge/-Twitter-1DA1F2?style=flat-square&logo=Twitter&logoColor=white" alt="X"/></a>
60
+ </p>
61
+
62
+ ## Citation
63
+
64
+ If you find this repo useful, please cite our paper:
65
+
66
+ ```bibtex
67
+ @inproceedings{choi2024towards,
68
+ title={Towards neuro-symbolic video understanding},
69
+ author={Choi, Minkyu and Goel, Harsh and Omama, Mohammad and Yang, Yunhao and Shah, Sahil and Chinchali, Sandeep},
70
+ booktitle={European Conference on Computer Vision},
71
+ pages={220--236},
72
+ year={2024},
73
+ organization={Springer}
74
+ }
75
+ ```
build_dependency.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ REPO_DIR="$(pwd)"
5
+ VENDORS_DIR="$REPO_DIR/vendors"
6
+ INSTALL_PREFIX="$VENDORS_DIR/install"
7
+
8
+ mkdir -p "$VENDORS_DIR"
9
+ cd "$VENDORS_DIR"
10
+
11
+ # carl-storm
12
+ cd "$VENDORS_DIR"
13
+ if [ ! -d "carl-storm" ]; then
14
+ git clone https://github.com/moves-rwth/carl-storm
15
+ fi
16
+ cmake -S carl-storm -B carl-storm/build \
17
+ -DCMAKE_BUILD_TYPE=Release \
18
+ -DCMAKE_INSTALL_PREFIX="$INSTALL_PREFIX"
19
+ cmake --build carl-storm/build -j"$(nproc)" --target lib_carl
20
+ cmake --build carl-storm/build --target install
21
+
22
+ # storm-stable
23
+ if [ ! -d "storm-stable" ]; then
24
+ git clone --branch stable --depth 1 --recursive https://github.com/moves-rwth/storm.git storm-stable
25
+ fi
26
+ cmake -S storm-stable -B storm-stable/build \
27
+ -DCMAKE_BUILD_TYPE=Release \
28
+ -DCMAKE_INSTALL_PREFIX="$INSTALL_PREFIX" \
29
+ -DSTORM_DEVELOPER=OFF \
30
+ -DSTORM_LOG_DISABLE_DEBUG=ON \
31
+ -DSTORM_PORTABLE=ON \
32
+ -DSTORM_USE_SPOT_SHIPPED=ON
33
+ cmake --build storm-stable/build -j"$(nproc)"
34
+ cmake --build storm-stable/build --target install
35
+
36
+ export CMAKE_ARGS="-DCMAKE_POLICY_VERSION_MINIMUM=3.5"
37
+ export STORM_DIR_HINT="$INSTALL_PREFIX"
38
+ export CARL_DIR_HINT="$INSTALL_PREFIX"
39
+ unset CMAKE_ARGS || true
40
+
demo_videos/blue_shirt.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afdac2b4ef3d815ccc8beb676bcccfb49245e52e4e8fe298f9fd32e7b2a1651d
3
+ size 3741328
demo_videos/car.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1cfbd984f956ba5dc70f6d76009d65f5fd9ecc26686233cff3cc4860c880025
3
+ size 7408741
demo_videos/dog_jump.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ddb11ae2c86cef81311b3c753e30447828fce6086025b01ba2893621365739b
3
+ size 5096729
demo_videos/teaser-gen3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af319b324939c6a03cae461ca9da1e28fa550cc54ef9ac3a12d9525a1f309e48
3
+ size 4421653
demo_videos/teaser-pika.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:518ccf98b32c4fbdcc49ca686c4c42d1f8f632a835c72da1dc9104f08d8439f5
3
+ size 1560341
execute_demo.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import uuid
4
+ import cv2
5
+ import subprocess
6
+ import numpy as np
7
+ import gradio as gr
8
+ import tempfile
9
+ from typing import Dict, List, Iterable, Tuple
10
+
11
+ from ns_vfs.video.read_mp4 import Mp4Reader
12
+ from execute_with_mp4 import process_entry
13
+
14
+
15
+ def _load_entry_from_reader(video_path, query_text):
16
+ reader = Mp4Reader(
17
+ [{"path": video_path, "query": query_text}],
18
+ openai_save_path="",
19
+ sampling_rate_fps=0.5
20
+ )
21
+ data = reader.read_video()
22
+ if not data:
23
+ raise RuntimeError("No data returned by Mp4Reader (check video path)")
24
+ return data[0]
25
+
26
+
27
+ def _make_empty_video(path, width=320, height=240, fps=1.0):
28
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
29
+ writer = cv2.VideoWriter(path, fourcc, fps, (width, height))
30
+ frame = np.zeros((height, width, 3), dtype=np.uint8)
31
+ writer.write(frame)
32
+ writer.release()
33
+ return path
34
+
35
+
36
+ def _crop_video_ffmpeg(input_path, output_path, frame_indices, prop_matrix):
37
+ if len(frame_indices) == 0:
38
+ cap = cv2.VideoCapture(str(input_path))
39
+ if not cap.isOpened():
40
+ raise RuntimeError(f"Could not open video: {input_path}")
41
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
42
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
43
+ cap.release()
44
+ _make_empty_video(output_path, width, height, fps=1.0)
45
+ return
46
+
47
+ def group_into_ranges(frames):
48
+ if not frames:
49
+ return []
50
+ frames = sorted(set(frames))
51
+ ranges = []
52
+ start = prev = frames[0]
53
+ for f in frames[1:]:
54
+ if f == prev + 1:
55
+ prev = f
56
+ else:
57
+ ranges.append((start, prev + 1)) # end-exclusive
58
+ start = prev = f
59
+ ranges.append((start, prev + 1))
60
+ return ranges
61
+
62
+ ranges = group_into_ranges(frame_indices)
63
+ filters = []
64
+ labels = []
65
+ for i, (start, end) in enumerate(ranges):
66
+ filters.append(
67
+ f"[0:v]trim=start_frame={start}:end_frame={end},setpts=PTS-STARTPTS[v{i}]"
68
+ )
69
+ labels.append(f"[v{i}]")
70
+ filters.append(f"{''.join(labels)}concat=n={len(ranges)}:v=1:a=0[outv]")
71
+
72
+ cmd = [
73
+ "ffmpeg", "-y", "-i", input_path,
74
+ "-filter_complex", "; ".join(filters),
75
+ "-map", "[outv]",
76
+ "-c:v", "libx264", "-preset", "fast", "-crf", "23",
77
+ output_path,
78
+ ]
79
+ subprocess.run(cmd, check=True)
80
+
81
+
82
+ def _crop_video(input_path: str, output_path: str, frame_indices: List[int], prop_matrix: Dict[str, List[int]]):
83
+ input_path = str(input_path)
84
+ output_path = str(output_path)
85
+
86
+ # Probe width/height/fps
87
+ cap = cv2.VideoCapture(input_path)
88
+ if not cap.isOpened():
89
+ raise RuntimeError(f"Could not open video: {input_path}")
90
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
91
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
92
+ fps = float(cap.get(cv2.CAP_PROP_FPS)) or 0.0
93
+ cap.release()
94
+ if fps <= 0:
95
+ fps = 30.0
96
+
97
+ # If nothing to write, emit a 1-frame empty video
98
+ if not frame_indices:
99
+ from numpy import zeros, uint8
100
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
101
+ out = cv2.VideoWriter(output_path, fourcc, 1.0, (width, height))
102
+ out.write(zeros((height, width, 3), dtype=uint8))
103
+ out.release()
104
+ return
105
+
106
+ # Helper: group consecutive integers into (start, end_exclusive)
107
+ def _group_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
108
+ f = sorted(set(int(x) for x in frames))
109
+ if not f:
110
+ return []
111
+ out = []
112
+ s = p = f[0]
113
+ for x in f[1:]:
114
+ if x == p + 1:
115
+ p = x
116
+ else:
117
+ out.append((s, p + 1))
118
+ s = p = x
119
+ out.append((s, p + 1))
120
+ return out
121
+
122
+ # Invert prop_matrix to {frame_idx: sorted [props]}
123
+ props_by_frame: Dict[int, List[str]] = {}
124
+ for prop, frames in (prop_matrix or {}).items():
125
+ for fi in frames:
126
+ fi = int(fi)
127
+ props_by_frame.setdefault(fi, []).append(prop)
128
+ for fi in list(props_by_frame.keys()):
129
+ props_by_frame[fi] = sorted(set(props_by_frame[fi]))
130
+
131
+ # Only subtitle frames we will output
132
+ fi_set = set(int(x) for x in frame_indices)
133
+ frames_with_labels = sorted(fi for fi in fi_set if props_by_frame.get(fi))
134
+
135
+ # Compress consecutive frames that share the same label set
136
+ grouped_label_spans: List[Tuple[int, int, Tuple[str, ...]]] = []
137
+ prev_f = None
138
+ prev_labels: Tuple[str, ...] = ()
139
+ span_start = None
140
+ for f in frames_with_labels:
141
+ labels = tuple(props_by_frame.get(f, []))
142
+ if prev_f is None:
143
+ span_start, prev_f, prev_labels = f, f, labels
144
+ elif (f == prev_f + 1) and (labels == prev_labels):
145
+ prev_f = f
146
+ else:
147
+ grouped_label_spans.append((span_start, prev_f + 1, prev_labels))
148
+ span_start, prev_f, prev_labels = f, f, labels
149
+ if prev_f is not None and prev_labels:
150
+ grouped_label_spans.append((span_start, prev_f + 1, prev_labels))
151
+
152
+ # Build ASS subtitle file (top-right)
153
+ def ass_time(t_sec: float) -> str:
154
+ cs = int(round(t_sec * 100))
155
+ h = cs // (100 * 3600)
156
+ m = (cs // (100 * 60)) % 60
157
+ s = (cs // 100) % 60
158
+ cs = cs % 100
159
+ return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
160
+
161
+ def make_ass(width: int, height: int) -> str:
162
+ lines = []
163
+ lines.append("[Script Info]")
164
+ lines.append("ScriptType: v4.00+")
165
+ lines.append("ScaledBorderAndShadow: yes")
166
+ lines.append(f"PlayResX: {width}")
167
+ lines.append(f"PlayResY: {height}")
168
+ lines.append("")
169
+ lines.append("[V4+ Styles]")
170
+ lines.append("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
171
+ "Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, "
172
+ "Shadow, Alignment, MarginL, MarginR, MarginV, Encoding")
173
+ # Font size 18 per your request; Alignment=9 (top-right)
174
+ lines.append("Style: Default,DejaVu Sans,18,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,"
175
+ "0,0,0,0,100,100,0,0,1,2,0.8,9,16,16,16,1")
176
+ lines.append("")
177
+ lines.append("[Events]")
178
+ lines.append("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text")
179
+
180
+ for start_f, end_f, labels in grouped_label_spans:
181
+ if not labels:
182
+ continue
183
+ start_t = ass_time(start_f / fps)
184
+ end_t = ass_time(end_f / fps)
185
+ text = r"\N".join(labels) # stacked lines
186
+ lines.append(f"Dialogue: 0,{start_t},{end_t},Default,,0,0,0,,{text}")
187
+
188
+ return "\n".join(lines)
189
+
190
+ tmp_dir = tempfile.mkdtemp(prefix="props_ass_")
191
+ ass_path = os.path.join(tmp_dir, "props.ass")
192
+ with open(ass_path, "w", encoding="utf-8") as f:
193
+ f.write(make_ass(width, height))
194
+
195
+ # Build trim/concat ranges from requested frame_indices
196
+ ranges = _group_ranges(frame_indices)
197
+
198
+ # Filtergraph with burned subtitles then trim/concat
199
+ split_labels = [f"[s{i}]" for i in range(len(ranges))] if ranges else []
200
+ out_labels = [f"[v{i}]" for i in range(len(ranges))] if ranges else []
201
+
202
+ filters = []
203
+ ass_arg = ass_path.replace("\\", "\\\\")
204
+ filters.append(f"[0:v]subtitles='{ass_arg}'[sub]")
205
+
206
+ if len(ranges) == 1:
207
+ s0, e0 = ranges[0]
208
+ filters.append(f"[sub]trim=start_frame={s0}:end_frame={e0},setpts=PTS-STARTPTS[v0]")
209
+ else:
210
+ if ranges:
211
+ filters.append(f"[sub]split={len(ranges)}{''.join(split_labels)}")
212
+ for i, (s, e) in enumerate(ranges):
213
+ filters.append(f"{split_labels[i]}trim=start_frame={s}:end_frame={e},setpts=PTS-STARTPTS{out_labels[i]}")
214
+
215
+ if ranges:
216
+ filters.append(f"{''.join(out_labels)}concat=n={len(ranges)}:v=1:a=0[outv]")
217
+
218
+ filter_complex = "; ".join(filters)
219
+
220
+ cmd = [
221
+ "ffmpeg", "-y",
222
+ "-i", input_path,
223
+ "-filter_complex", filter_complex,
224
+ "-map", "[outv]" if ranges else "[sub]",
225
+ "-c:v", "libx264", "-preset", "fast", "-crf", "23",
226
+ output_path,
227
+ ]
228
+ try:
229
+ subprocess.run(cmd, check=True)
230
+ finally:
231
+ try:
232
+ os.remove(ass_path)
233
+ os.rmdir(tmp_dir)
234
+ except OSError:
235
+ pass
236
+
237
+
238
+ def _format_prop_ranges(prop_matrix: Dict[str, List[int]]) -> str:
239
+ def group_into_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
240
+ f = sorted(set(int(x) for x in frames))
241
+ if not f:
242
+ return []
243
+ ranges: List[Tuple[int, int]] = []
244
+ s = p = f[0]
245
+ for x in f[1:]:
246
+ if x == p + 1:
247
+ p = x
248
+ else:
249
+ ranges.append((s, p)) # inclusive end for display
250
+ s = p = x
251
+ ranges.append((s, p))
252
+ return ranges
253
+
254
+ if not prop_matrix:
255
+ return "No propositions detected."
256
+
257
+ lines = []
258
+ for prop, frames in prop_matrix.items():
259
+ ranges = group_into_ranges(frames)
260
+ pretty = prop.replace("_", " ").title()
261
+ if not ranges:
262
+ lines.append(f"{pretty}: —")
263
+ continue
264
+ parts = [f"{a}" if a == b else f"{a}-{b}" for (a, b) in ranges]
265
+ lines.append(f"{pretty}: {', '.join(parts)}")
266
+ return "\n".join(lines)
267
+
268
+
269
+ # -----------------------------
270
+ # Gradio handler
271
+ # -----------------------------
272
+ def run_pipeline(input_video, mode, query_text, propositions_json, specification_text):
273
+ """
274
+ Returns: (cropped_video_path, prop_ranges_text, tl_text)
275
+ """
276
+
277
+ def _err(msg, width=320, height=240): # keep outputs shape consistent
278
+ tmp_out = os.path.join("/tmp", f"empty_{uuid.uuid4().hex}.mp4")
279
+ _make_empty_video(tmp_out, width=width, height=height, fps=1.0)
280
+ return (
281
+ tmp_out,
282
+ "No propositions detected.",
283
+ f"Error: {msg}"
284
+ )
285
+
286
+ # Resolve video path
287
+ if isinstance(input_video, dict) and "name" in input_video:
288
+ video_path = input_video["name"]
289
+ elif isinstance(input_video, str):
290
+ video_path = input_video
291
+ else:
292
+ return _err("Please provide a video.")
293
+
294
+ # Build entry
295
+ if mode == "Natural language query":
296
+ if not query_text or not query_text.strip():
297
+ return _err("Please enter a query.")
298
+ entry = _load_entry_from_reader(video_path, query_text)
299
+ else:
300
+ if not (propositions_json and propositions_json.strip()) or not (specification_text and specification_text.strip()):
301
+ return _err("Please provide both Propositions (array) and Specification.")
302
+ entry = _load_entry_from_reader(video_path, "dummy-query")
303
+ try:
304
+ props = json.loads(propositions_json)
305
+ if not isinstance(props, list):
306
+ return _err("Propositions must be a JSON array.")
307
+ except Exception as e:
308
+ return _err(f"Failed to parse propositions JSON: {e}")
309
+ entry["tl"] = {
310
+ "propositions": props,
311
+ "specification": specification_text
312
+ }
313
+
314
+ # Compute FOI
315
+ try:
316
+ foi, prop_matrix = process_entry(entry) # list of frame indices & {prop: [frames]}
317
+ print(foi)
318
+ print(prop_matrix)
319
+ except Exception as e:
320
+ return _err(f"Processing error: {e}")
321
+
322
+ # Write cropped video
323
+ try:
324
+ out_path = os.path.join("/tmp", f"cropped_{uuid.uuid4().hex}.mp4")
325
+ _crop_video(video_path, out_path, foi, prop_matrix)
326
+ print(f"Wrote cropped video to: {out_path}")
327
+ except Exception as e:
328
+ return _err(f"Failed to write cropped video: {e}")
329
+
330
+ # Build right-side text sections
331
+ prop_ranges_text = _format_prop_ranges(prop_matrix)
332
+ tl_text = (
333
+ f"Propositions: {json.dumps(entry['tl']['propositions'], ensure_ascii=False)}\n"
334
+ f"Specification: {entry['tl']['specification']}"
335
+ )
336
+ return out_path, prop_ranges_text, tl_text
337
+
338
+
339
+ # -----------------------------
340
+ # UI
341
+ # -----------------------------
342
+ with gr.Blocks(css="""
343
+ #io-col {display: flex; gap: 1rem;}
344
+ #left {flex: 1;}
345
+ #right {flex: 1;}
346
+ """, title="NSVS-TL") as demo:
347
+
348
+ gr.Markdown("# Neuro-Symbolic Visual Search with Temporal Logic")
349
+ gr.Markdown(
350
+ "Upload a video and either provide a natural-language **Query** *or* directly supply **Propositions** (array) + **Specification**. "
351
+ "On the right, you'll get a **cropped video** containing only the frames of interest, a **Propositions by Frames** summary, and the combined TL summary."
352
+ )
353
+
354
+ with gr.Row(elem_id="io-col"):
355
+ with gr.Column(elem_id="left"):
356
+ mode = gr.Radio(
357
+ choices=["Natural language query", "Props/Spec"],
358
+ value="Natural language query",
359
+ label="Input mode"
360
+ )
361
+ video = gr.Video(label="Upload Video")
362
+
363
+ query = gr.Textbox(
364
+ label="Query (natural language)",
365
+ placeholder="e.g., a man is jumping and panting until he falls down"
366
+ )
367
+
368
+ propositions = gr.Textbox(
369
+ label="Propositions (JSON array)",
370
+ placeholder='e.g., ["man_jumps", "man_pants", "man_falls_down"]',
371
+ lines=4,
372
+ visible=False
373
+ )
374
+ specification = gr.Textbox(
375
+ label="Specification",
376
+ placeholder='e.g., ("woman_jumps" & "woman_claps") U "candle_is_blown"',
377
+ visible=False
378
+ )
379
+
380
+ def _toggle_fields(m):
381
+ if m == "Natural language query":
382
+ return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
383
+ else:
384
+ return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
385
+
386
+ mode.change(_toggle_fields, inputs=[mode], outputs=[query, propositions, specification])
387
+
388
+ run_btn = gr.Button("Run", variant="primary")
389
+
390
+ gr.Examples(
391
+ label="Examples (dummy paths + queries)",
392
+ examples=[
393
+ ["demo_videos/dog_jump.mp4", "a dog jumps until a red tube is in view"],
394
+ ["demo_videos/blue_shirt.mp4", "a girl in a green shirt until a candle is blown"],
395
+ ["demo_videos/car.mp4", "red car until a truck"]
396
+ ],
397
+ inputs=[video, query],
398
+ cache_examples=False
399
+ )
400
+
401
+ with gr.Column(elem_id="right"):
402
+ cropped_video = gr.Video(label="Cropped Video (Frames of Interest Only)")
403
+
404
+ prop_ranges_out = gr.Textbox(
405
+ label="Propositions by Frames",
406
+ lines=6,
407
+ interactive=False
408
+ )
409
+
410
+ tl_out = gr.Textbox(
411
+ label="TL (Propositions & Specification)",
412
+ lines=8,
413
+ interactive=False
414
+ )
415
+
416
+ run_btn.click(
417
+ fn=run_pipeline,
418
+ inputs=[video, mode, query, propositions, specification],
419
+ outputs=[cropped_video, prop_ranges_out, tl_out]
420
+ )
421
+
422
+ if __name__ == "__main__":
423
+ demo.launch(server_name="0.0.0.0", server_port=7860)
424
+
execute_with_mp4.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import itertools
3
+ import operator
4
+ import json
5
+ import time
6
+ import os
7
+
8
+ from ns_vfs.nsvs import run_nsvs
9
+ from ns_vfs.video.read_mp4 import Mp4Reader
10
+
11
+
12
+ VIDEOS = [
13
+ {
14
+ "path": "demo_videos/blue_shirt.mp4",
15
+ "query": "a woman is jumping and clapping until a candle is blown"
16
+ }
17
+ ]
18
+ DEVICE = 7 # GPU device index
19
+ OPENAI_SAVE_PATH = ""
20
+ OUTPUT_DIR = "output"
21
+
22
+ def fill_in_frame_count(arr, entry):
23
+ scale = (entry["video_info"].fps) / (entry["metadata"]["sampling_rate_fps"])
24
+
25
+ runs = []
26
+ for _, grp in itertools.groupby(sorted(arr), key=lambda x, c=[0]: (x - (c.__setitem__(0, c[0]+1) or c[0]))):
27
+ g = list(grp)
28
+ runs.append((g[0], g[-1]))
29
+
30
+ real = []
31
+ for start_i, end_i in runs:
32
+ a = int(round(start_i * scale))
33
+ b = int(round(end_i * scale))
34
+ if real and a <= real[-1]:
35
+ a = real[-1] + 1
36
+ real.extend(range(a, b + 1))
37
+ return real
38
+
39
+ def process_entry(entry):
40
+ foi, object_frame_dict = run_nsvs(
41
+ frames=entry['images'],
42
+ proposition=entry['tl']['propositions'],
43
+ specification=entry['tl']['specification'],
44
+ model_name="InternVL2-8B",
45
+ device=DEVICE
46
+ )
47
+
48
+ foi = fill_in_frame_count([i for sub in foi for i in sub], entry)
49
+ object_frame_dict = {key: fill_in_frame_count(value, entry) for key, value in object_frame_dict.items()}
50
+ return foi, object_frame_dict
51
+
52
+ def main():
53
+ reader = Mp4Reader(VIDEOS, OPENAI_SAVE_PATH, sampling_rate_fps=1)
54
+ data = reader.read_video()
55
+ if not data:
56
+ return
57
+
58
+ with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
59
+ for i, entry in pbar:
60
+ start_time = time.time()
61
+ foi = process_entry(entry)
62
+ end_time = time.time()
63
+ processing_time = round(end_time - start_time, 3)
64
+
65
+ if foi:
66
+ output = {
67
+ "tl": entry["tl"],
68
+ "metadata": entry["metadata"],
69
+ "video_info": entry["video_info"].to_dict(),
70
+ "frames_of_interest": foi,
71
+ "processting_time_seconds": processing_time
72
+ }
73
+
74
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
75
+ with open(os.path.join(OUTPUT_DIR, f"output_{i}.json"), "w") as f:
76
+ json.dump(output, f, indent=4)
77
+
78
+ if __name__ == "__main__":
79
+ main()
execute_with_tlv.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import json
3
+ import time
4
+ import os
5
+
6
+ from ns_vfs.nsvs import run_nsvs
7
+ from ns_vfs.video.read_tlv import TLVReader
8
+
9
+
10
+ TLV_PATH = "/nas/dataset/tlv-dataset-v1"
11
+ DEVICE = 7 # GPU device index
12
+ OUTPUT_DIR = "output"
13
+
14
+ def process_entry(entry):
15
+ foi = run_nsvs(
16
+ frames=entry['images'],
17
+ proposition=entry['tl']['propositions'],
18
+ specification=entry['tl']['specification'],
19
+ model_name="InternVL2-8B",
20
+ device=DEVICE
21
+ )
22
+ return foi
23
+
24
+ def main():
25
+ reader = TLVReader(TLV_PATH)
26
+ data = reader.read_video()
27
+ if not data:
28
+ return
29
+
30
+ with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
31
+ for i, entry in pbar:
32
+ start_time = time.time()
33
+ foi = process_entry(entry)
34
+ end_time = time.time()
35
+ processing_time = round(end_time - start_time, 3)
36
+
37
+ if foi:
38
+ output = {
39
+ "tl": entry["tl"],
40
+ "metadata": entry["metadata"],
41
+ "video_info": entry["video_info"].to_dict(),
42
+ "frames_of_interest": foi,
43
+ "processting_time_seconds": processing_time
44
+ }
45
+
46
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
47
+ with open(os.path.join(OUTPUT_DIR, f"output_{i}.json"), "w") as f:
48
+ json.dump(output, f, indent=4)
49
+
50
+ if __name__ == "__main__":
51
+ main()
launch_space.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ apt update
4
+ apt install -y ffmpeg
5
+
6
+ # Start vLLM server in background
7
+ ./vllm_serve.sh &
8
+
9
+ # Wait briefly to ensure vLLM is up before Gradio tries to connect
10
+ sleep 60
11
+
12
+ # Display fancy startup message
13
+ echo "
14
+ ╔════════════════════════════════════════════════════════════════╗
15
+ ║ ║
16
+ ║ 🚀 Gradio Space Starting! 🚀 ║
17
+ ║ ║
18
+ ╚════════════════════════════════════════════════════════════════╝
19
+ "
20
+
21
+ # Start Gradio app
22
+ python3 execute_demo.py
ns_vfs/model_checker/__init__.py ADDED
File without changes
ns_vfs/model_checker/frame_validator.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import enum
3
+
4
+ from ns_vfs.video.frame import VideoFrame
5
+
6
+ class SymbolicFilterRule(enum.Enum):
7
+ AVOID_PROPS = "avoid"
8
+ ASSOCIATED_PROPS = "associated"
9
+
10
+ class FrameValidator:
11
+ def __init__(
12
+ self,
13
+ ltl_formula: str,
14
+ threshold_of_probability: float = 0.5,
15
+ ):
16
+ self.threshold_of_probability = threshold_of_probability
17
+
18
+ ltl_formula = ltl_formula[ltl_formula.find('[') + 1:ltl_formula.rfind(']')]
19
+ if " U " in ltl_formula:
20
+ rule_1 = self.get_symbolic_rule_from_ltl_formula(ltl_formula.split(" U ")[0])
21
+ rule_2 = self.get_symbolic_rule_from_ltl_formula(ltl_formula.split(" U ")[1])
22
+ self.symbolic_verification_rule = {
23
+ SymbolicFilterRule.ASSOCIATED_PROPS: rule_1[SymbolicFilterRule.ASSOCIATED_PROPS] + rule_2[SymbolicFilterRule.ASSOCIATED_PROPS],
24
+ SymbolicFilterRule.AVOID_PROPS: rule_1[SymbolicFilterRule.AVOID_PROPS] or rule_2[SymbolicFilterRule.AVOID_PROPS],
25
+ }
26
+ else:
27
+ self.symbolic_verification_rule = self.get_symbolic_rule_from_ltl_formula(ltl_formula)
28
+
29
+ def validate_frame(
30
+ self,
31
+ frame: VideoFrame,
32
+ ):
33
+ """Validate frame."""
34
+ thresholded_objects = frame.thresholded_detected_objects(self.threshold_of_probability)
35
+ if len(thresholded_objects) > 0:
36
+ return self.symbolic_verification(frame)
37
+ else:
38
+ return False
39
+
40
+ def symbolic_verification(self, frame: VideoFrame):
41
+ """Symbolic verification."""
42
+ avoid_props = self.symbolic_verification_rule.get(SymbolicFilterRule.AVOID_PROPS)
43
+ if avoid_props:
44
+ for prop in frame.object_of_interest.keys():
45
+ if frame.object_of_interest[prop].get_detected_probability() >= self.threshold_of_probability and prop in avoid_props: # detected but also in avoid_props
46
+ return False
47
+
48
+ associated_props = self.symbolic_verification_rule.get(SymbolicFilterRule.ASSOCIATED_PROPS)
49
+ for group in associated_props:
50
+ bad = 0
51
+ total = 0
52
+ for prop in group:
53
+ total += 1
54
+ if frame.object_of_interest[prop].get_detected_probability() < self.threshold_of_probability:
55
+ bad += 1
56
+ if total > 2 * bad:
57
+ return True
58
+ return False
59
+
60
+ def get_symbolic_rule_from_ltl_formula(self, ltl_formula: str) -> dict:
61
+ symbolic_verification_rule = {}
62
+
63
+ if "!" in ltl_formula:
64
+ match = re.search(r'(?<!\w)!\s*(?:\((.*?)\)|([^\s\)]+))', ltl_formula)
65
+ avoid_tl = (match.group(1) or match.group(2)).strip()
66
+ symbolic_verification_rule[SymbolicFilterRule.AVOID_PROPS] = avoid_tl
67
+ else:
68
+ symbolic_verification_rule[SymbolicFilterRule.AVOID_PROPS] = None
69
+
70
+ ltl_formula = re.sub(r"[!GF]", "", ltl_formula.strip())
71
+ while ltl_formula.startswith("(") and ltl_formula.endswith(")") and ltl_formula.count("(") == ltl_formula.count(")"):
72
+ ltl_formula = ltl_formula[1:-1].strip()
73
+
74
+ split_and_clean = lambda expr: [re.sub(r"[()]", "", p).strip() for p in re.split(r"\s*&\s*", expr) if p.strip()]
75
+
76
+ match = re.search(r'\b( U |F)\b', ltl_formula)
77
+ if match:
78
+ idx = match.start()
79
+ associated = [split_and_clean(ltl_formula[:idx]), split_and_clean(ltl_formula[idx + len(match.group(1)):])]
80
+ else:
81
+ associated = [split_and_clean(ltl_formula)]
82
+ associated = [[s.strip('"') for s in sublist] for sublist in associated]
83
+ symbolic_verification_rule[SymbolicFilterRule.ASSOCIATED_PROPS] = associated
84
+
85
+ return symbolic_verification_rule
86
+
ns_vfs/model_checker/property_checker.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ns_vfs.model_checker.stormpy import StormModelChecker
2
+ from ns_vfs.model_checker.frame_validator import FrameValidator
3
+
4
+ class PropertyChecker:
5
+ def __init__(self, proposition, specification, model_type, tl_satisfaction_threshold, detection_threshold):
6
+ self.proposition = proposition
7
+ self.tl_satisfaction_threshold = tl_satisfaction_threshold
8
+ self.specification = self.generate_specification(specification)
9
+ self.model_type = model_type
10
+ self.detection_threshold = detection_threshold
11
+
12
+ self.model_checker = StormModelChecker(
13
+ proposition_set=self.proposition,
14
+ ltl_formula=self.specification
15
+ )
16
+ self.frame_validator = FrameValidator(
17
+ ltl_formula=self.specification,
18
+ threshold_of_probability=self.detection_threshold
19
+ )
20
+
21
+ def generate_specification(self, specification_raw):
22
+ return f'P>={self.tl_satisfaction_threshold:.2f} [ {specification_raw} ]'
23
+
24
+ def validate_frame(self, frame_of_interest):
25
+ return self.frame_validator.validate_frame(frame_of_interest)
26
+
27
+ def check_automaton(self, automaton):
28
+ return self.model_checker.check_automaton(
29
+ transitions=automaton.transitions,
30
+ states=automaton.states,
31
+ model_type=self.model_type
32
+ )
33
+
34
+ def validate_tl_specification(self, specification):
35
+ return self.model_checker.validate_tl_specification(specification)
36
+
37
+
38
+
ns_vfs/model_checker/stormpy.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+
4
+ import numpy as np
5
+ import stormpy
6
+ import stormpy.examples.files
7
+ from stormpy import ExplicitQualitativeCheckResult
8
+
9
+ from ns_vfs.model_checker.video_state import VideoState
10
+
11
+
12
+ class StormModelChecker:
13
+ """Model Checker using Stormpy for verifying properties."""
14
+
15
+ def __init__(
16
+ self,
17
+ proposition_set: list[str],
18
+ ltl_formula: str,
19
+ ) -> None:
20
+ """Initialize the StormModelChecker.
21
+
22
+ Args:
23
+ proposition_set: List of propositions.
24
+ ltl_formula: LTL formula to check.
25
+ verbose: Enable verbose output.
26
+ is_filter: Apply filtering to results.
27
+ """
28
+ self.proposition_set = proposition_set
29
+ self.ltl_formula = ltl_formula
30
+
31
+ def create_model(
32
+ self,
33
+ transitions: list[tuple[int, int, float]],
34
+ states: list[VideoState],
35
+ model_type: str = "sparse_ma",
36
+ ) -> any:
37
+ """Create model.
38
+
39
+ Args:
40
+ transitions (list[tuple[int, int, float]]): List of transitions.
41
+ states (list[VideoState]): List of states.
42
+ model_type (str): Type of model to create ("sparse_ma" or "dtmc").
43
+ verbose (bool): Whether to print verbose output.
44
+ """
45
+ state_labeling = self._build_label_func(states, self.proposition_set)
46
+ if model_type in ["sparse_ma", "mdp"]:
47
+ transition_matrix = self._build_trans_matrix(
48
+ transitions=transitions,
49
+ states=states,
50
+ model_type="nondeterministic",
51
+ )
52
+ else:
53
+ transition_matrix = self._build_trans_matrix(
54
+ transitions=transitions,
55
+ states=states,
56
+ model_type="deterministic",
57
+ )
58
+ components = stormpy.SparseModelComponents(
59
+ transition_matrix=transition_matrix,
60
+ state_labeling=state_labeling,
61
+ )
62
+ if model_type == "sparse_ma":
63
+ markovian_states = stormpy.BitVector(len(states), list(range(len(states))))
64
+ components.markovian_states = markovian_states
65
+ components.exit_rates = [1.0 for _ in range(len(states))]
66
+ model = stormpy.SparseMA(components)
67
+ elif model_type == "dtmc":
68
+ model = stormpy.storage.SparseDtmc(components)
69
+ elif model_type == "mdp":
70
+ model = stormpy.storage.SparseMdp(components)
71
+ else:
72
+ msg = f"Unsupported model type: {model_type}"
73
+ raise ValueError(msg)
74
+ return model
75
+
76
+ def check_automaton(
77
+ self,
78
+ transitions: list[tuple[int, int, float]],
79
+ states: list[VideoState],
80
+ model_type: str = "sparse_ma"
81
+ ) -> any:
82
+ """Check automaton.
83
+
84
+ Args:
85
+ transitions: List of transitions.
86
+ states: List of states.
87
+ verbose: Enable verbose output.
88
+ use_filter: Apply filtering to results.
89
+ """
90
+ model = self.create_model(
91
+ transitions=transitions,
92
+ states=states,
93
+ model_type=model_type,
94
+ )
95
+
96
+ # Define Properties
97
+ properties = stormpy.parse_properties_without_context(self.ltl_formula,)
98
+
99
+ # Get Result and Filter it
100
+ result = stormpy.model_checking(model, properties[0])
101
+
102
+ return self.qualitative_result_eval(result)
103
+
104
+ def qualitative_result_eval(self, verification_result: ExplicitQualitativeCheckResult) -> bool:
105
+ if isinstance(verification_result, ExplicitQualitativeCheckResult):
106
+ # string result is "true" when is absolutely true
107
+ # but it returns "true, false" when we have some true and false
108
+ verification_result_str = str(verification_result)
109
+ string_result = verification_result_str.split("{")[-1].split("}")[0]
110
+ if len(string_result) == 4:
111
+ if string_result[0] == "t": # 0,6
112
+ result = True
113
+ elif len(string_result) > 5:
114
+ # "true, false" -> some true and some false
115
+ result = True
116
+ else:
117
+ result = False
118
+ return result
119
+ msg = "Model Checking is not qualitative"
120
+ raise ValueError(msg)
121
+
122
+ def _build_trans_matrix(
123
+ self,
124
+ transitions: list[tuple[int, int, float]],
125
+ states: list[VideoState],
126
+ model_type: str = "nondeterministic",
127
+ ) -> stormpy.storage.SparseMatrix:
128
+ """Build transition matrix.
129
+
130
+ Args:
131
+ transitions: List of transitions.
132
+ states: List of states.
133
+ model_type: Type of model ("nondeterministic" or "deterministic").
134
+ """
135
+ if model_type not in ["nondeterministic", "deterministic"]:
136
+ msg = "Invalid model_type. Must be 'nondeterministic' or 'deterministic'"
137
+ raise ValueError(msg)
138
+
139
+ if model_type == "nondeterministic":
140
+ matrix = np.zeros((len(states), len(states)))
141
+ for t in transitions:
142
+ matrix[int(t[0]), int(t[1])] = float(t[2])
143
+ trans_matrix = stormpy.build_sparse_matrix(matrix, list(range(len(states))))
144
+
145
+ elif model_type == "deterministic":
146
+ num_states = len(states)
147
+ builder = stormpy.SparseMatrixBuilder(
148
+ rows=num_states,
149
+ columns=num_states,
150
+ entries=len(transitions),
151
+ force_dimensions=False,
152
+ )
153
+ states_with_transitions = set(src for src, _, _ in transitions)
154
+ outgoing_probs = {i: 0.0 for i in range(num_states)}
155
+
156
+ for src, dest, prob in transitions:
157
+ builder.add_next_value(src, dest, prob)
158
+ outgoing_probs[src] += prob
159
+
160
+ for state in range(num_states):
161
+ if state not in states_with_transitions:
162
+ builder.add_next_value(state, state, 1.0)
163
+ outgoing_probs[state] = 1.0
164
+
165
+ # Check probabilities
166
+ for state, prob_sum in outgoing_probs.items():
167
+ # if not math.isclose(prob_sum, 1.0, rel_tol=1e-9):
168
+ if not math.isclose(prob_sum, 1.0, abs_tol=1e-2):
169
+ logging.warning(f"State {state} has outgoing probability sum of {prob_sum}, not 1.0")
170
+
171
+ # ... (existing logging code) ...
172
+ trans_matrix = builder.build()
173
+ return trans_matrix
174
+
175
+ def _build_label_func(
176
+ self,
177
+ states: list[VideoState],
178
+ props: list[str],
179
+ model_type: str = "nondeterministic",
180
+ ) -> stormpy.storage.StateLabeling:
181
+ """Build label function.
182
+
183
+ Args:
184
+ states (list[State]): List of states.
185
+ props (list[str]): List of propositions.
186
+ model_type (str): Type of model
187
+ ("nondeterministic" or "deterministic").
188
+
189
+ Returns:
190
+ stormpy.storage.StateLabeling: State labeling.
191
+ """
192
+ state_labeling = stormpy.storage.StateLabeling(len(states))
193
+ state_labeling.add_label("init")
194
+ state_labeling.add_label("terminal")
195
+ for label in props:
196
+ state_labeling.add_label(label)
197
+
198
+ if model_type == "nondeterministic":
199
+ for state in states:
200
+ for label in state.descriptive_label:
201
+ state_labeling.add_label_to_state(label, state.state_index)
202
+ else:
203
+ for i, state in enumerate(states):
204
+ for prop in state.props:
205
+ if prop in props:
206
+ state_labeling.add_label_to_state(prop, i)
207
+ return state_labeling
208
+
209
+ def validate_tl_specification(self, ltl_formula: str) -> bool:
210
+ """Validate LTL specification.
211
+
212
+ Args:
213
+ ltl_formula: LTL formula to validate.
214
+ """
215
+ path = stormpy.examples.files.prism_dtmc_die # prism_mdp_maze
216
+ prism_program = stormpy.parse_prism_program(path)
217
+ # Define Properties
218
+ try:
219
+ stormpy.parse_properties(ltl_formula, prism_program)
220
+ except Exception as e:
221
+ msg = f"Error validating LTL specification: {e}"
222
+ logging.exception(msg)
223
+ return False
224
+ else:
225
+ return True
ns_vfs/model_checker/video_automaton.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ns_vfs.model_checker.video_state import VideoState
2
+ from ns_vfs.video.frame import VideoFrame
3
+
4
+
5
+ class VideoAutomaton:
6
+ """Represents a Markov Automaton for video state modeling."""
7
+
8
+ def __init__(self, include_initial_state: bool = False) -> None:
9
+ """Initialize the MarkovAutomaton.
10
+
11
+ Args:
12
+ include_initial_state (bool, optional): Whether to include
13
+ the initial state. Defaults to False.
14
+ proposition_set (list[str] | None, optional): List of propositions.
15
+ Defaults to None.
16
+ """
17
+ self.previous_states: list[VideoState] = []
18
+ self.states: list[VideoState] = []
19
+ self.transitions = []
20
+ self.include_initial_state = include_initial_state
21
+
22
+ def set_up(self, proposition_set: list[str]) -> None:
23
+ """Set up the MarkovAutomaton."""
24
+ self.proposition_set = proposition_set
25
+ self.label_combinations = self._create_label_combinations(len(proposition_set))
26
+ self.probability_of_propositions = [[] for _ in range(len(proposition_set))]
27
+ self.frame_index_in_automaton = 0
28
+
29
+ if self.include_initial_state:
30
+ initial_state = VideoState(
31
+ state_index=0,
32
+ frame_index=-1,
33
+ label="init",
34
+ proposition_set=proposition_set,
35
+ )
36
+ self.previous_states = [initial_state]
37
+ self.states = [initial_state]
38
+ self._current_state = initial_state
39
+
40
+ def reset(self) -> None:
41
+ """Reset automaton."""
42
+ self.__init__(self.include_initial_state)
43
+ self.set_up(self.proposition_set)
44
+
45
+ def add_frame(self, frame: VideoFrame) -> None:
46
+ """Add frame to automaton."""
47
+ self._get_probability_of_propositions(frame)
48
+ current_states = []
49
+ for prop_comb in self.label_combinations:
50
+ # iterate through all possible combinations of T and F
51
+ self._current_state = VideoState(
52
+ state_index=len(self.states),
53
+ frame_index=self.frame_index_in_automaton,
54
+ label=prop_comb,
55
+ proposition_set=self.proposition_set,
56
+ )
57
+ # TODO: Make a method for update and compute probability
58
+ self._current_state.update(
59
+ frame_index=self.frame_index_in_automaton,
60
+ target_label=prop_comb,
61
+ )
62
+ self._current_state.compute_probability(probabilities=self.probability_of_propositions)
63
+ if self._current_state.probability > 0:
64
+ self.states.append(self._current_state)
65
+ current_states.append(self._current_state)
66
+
67
+ # Build transitions from previous states to current states
68
+ if self.previous_states:
69
+ for prev_state in self.previous_states:
70
+ for cur_state in current_states:
71
+ transition = (
72
+ prev_state.state_index,
73
+ cur_state.state_index,
74
+ cur_state.probability,
75
+ )
76
+ self.transitions.append(transition)
77
+
78
+ self.previous_states = current_states if current_states else self.previous_states
79
+ self.frame_index_in_automaton += 1
80
+
81
+ def add_terminal_state(self, add_with_terminal_label: bool = False) -> None:
82
+ """Add terminal state to the automaton."""
83
+ if add_with_terminal_label:
84
+ terminal_state_index = len(self.states)
85
+ terminal_state = VideoState(
86
+ state_index=terminal_state_index,
87
+ frame_index=self.frame_index_in_automaton,
88
+ label="terminal",
89
+ proposition_set=self.proposition_set,
90
+ )
91
+ self.states.append(terminal_state)
92
+ self._current_state = terminal_state
93
+
94
+ self.transitions.extend(
95
+ (prev_state.state_index, terminal_state_index, 1.0) for prev_state in self.previous_states
96
+ )
97
+ self.transitions.append((terminal_state_index, terminal_state_index, 1.0))
98
+ else:
99
+ self.transitions.extend(
100
+ (prev_state.state_index, prev_state.state_index, 1.0) for prev_state in self.previous_states
101
+ )
102
+
103
+ def get_frame_to_state_index(self) -> dict[int, list[int]]:
104
+ """Get frame to state index mapping."""
105
+ data = {}
106
+ for state in self.states:
107
+ if state.frame_index not in data:
108
+ data[state.frame_index] = []
109
+ data[state.frame_index].append(state.state_index)
110
+ return data
111
+
112
+ def _get_probability_of_propositions(self, frame: VideoFrame) -> None:
113
+ """Update the probability of propositions."""
114
+ for i, prop in enumerate(self.proposition_set):
115
+ if frame.object_of_interest.get(prop):
116
+ probability = frame.object_of_interest[prop].get_detected_probability()
117
+ else:
118
+ prop = prop.replace("_", " ")
119
+ if frame.object_of_interest.get(prop):
120
+ probability = frame.object_of_interest[prop].get_detected_probability()
121
+ else:
122
+ probability = 0
123
+ self.probability_of_propositions[i].append(round(float(probability), 2))
124
+
125
+ def _create_label_combinations(self, num_props: int) -> list[str]:
126
+ """Create all possible combinations of T and F for the number of propositions.
127
+
128
+ Args:
129
+ num_props (int): Number of propositions.
130
+
131
+ Returns:
132
+ list[str]: List of all possible combinations of T and F.
133
+ """
134
+ label_list = []
135
+
136
+ def add_labels(num_props: int, label: str, label_list: list[str]) -> None:
137
+ if len(label) == num_props:
138
+ label_list.append(label)
139
+ return
140
+ add_labels(num_props, label + "T", label_list)
141
+ add_labels(num_props, label + "F", label_list)
142
+
143
+ add_labels(num_props, "", label_list)
144
+ return label_list
ns_vfs/model_checker/video_state.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class VideoState:
2
+ """Video state class."""
3
+
4
+ def __init__(
5
+ self,
6
+ state_index: int,
7
+ frame_index: int,
8
+ label: str,
9
+ proposition_set: list[str],
10
+ probability: float = 1.0,
11
+ ) -> None:
12
+ """State class.
13
+
14
+ Args:
15
+ state_index (int): state_index.
16
+ frame_index (int): Frame index.
17
+ label (str): Label set. :abel is a string with characters T or F
18
+ indicating True or False
19
+ proposition_set (list[str]): Proposition set.
20
+ probability (float): Probability of the state.
21
+ """
22
+ self.state_index = state_index
23
+ self.frame_index = frame_index
24
+ self.proposition_set = proposition_set
25
+ self.label = label # "init", "terminal", TTT, TFT, FTT, etc.
26
+ self.descriptive_label = self._get_descriptive_label(label=label)
27
+ self.probability = probability
28
+
29
+ def __repr__(self) -> str:
30
+ """Representation of state."""
31
+ return f"{self.frame_index}|{self.state_index} ({self.probability}): {self.label}"
32
+
33
+ def __str__(self) -> str:
34
+ """String of state."""
35
+ return f"{self.__repr__()}"
36
+
37
+ def _get_descriptive_label(self, label: str) -> list:
38
+ """Get descriptive label.
39
+
40
+ Args:
41
+ label (str): Label.
42
+ """
43
+ labels = []
44
+ if label == "init":
45
+ labels.append("init")
46
+ elif label == "terminal":
47
+ labels.append("terminal")
48
+ else:
49
+ for i in range(len(self.proposition_set)):
50
+ if label[i] == "T":
51
+ labels.append(self.proposition_set[i])
52
+ return labels
53
+
54
+ def update(self, frame_index: int, target_label: str) -> None:
55
+ """Update state to the new state..
56
+
57
+ Args:
58
+ frame_index (int): Frame index.
59
+ target_label (str): Target label for the new state.
60
+ """
61
+ self.frame_index = frame_index
62
+ self.label = target_label # TTT, TFT, FTT, etc.
63
+ self.descriptive_label = self._get_descriptive_label(label=target_label)
64
+ self.probability = 1.0
65
+
66
+ def compute_probability(self, probabilities: list[list[float]]) -> None:
67
+ """Compute probability of the state given the probabilities of the propositions.
68
+
69
+ Args:
70
+ probabilities (list): list of probabilities of the propositions
71
+ e.g. two propositions with three frames
72
+ -> [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]].
73
+ """
74
+ probability = 1.0
75
+ for i in range(len(self.label)):
76
+ if self.label[i] == "T":
77
+ probability *= probabilities[i][self.frame_index]
78
+ else:
79
+ probability *= 1 - probabilities[i][self.frame_index]
80
+ self.probability = round(probability, 3)
ns_vfs/nsvs.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import warnings
3
+ import tqdm
4
+ import os
5
+
6
+ from ns_vfs.model_checker.property_checker import PropertyChecker
7
+ from ns_vfs.model_checker.video_automaton import VideoAutomaton
8
+ from ns_vfs.video.frame import FramesofInterest
9
+ from ns_vfs.vlm.vllm_client import VLLMClient
10
+ from ns_vfs.video.frame import VideoFrame
11
+ from ns_vfs.vlm.internvl import InternVL
12
+
13
+ PRINT_ALL = True
14
+ warnings.filterwarnings("ignore")
15
+
16
+ def run_nsvs(
17
+ frames: list[np.ndarray],
18
+ proposition: list,
19
+ specification: str,
20
+ model_name: str = "InternVL2-8B",
21
+ device: int = 0,
22
+ model_type: str = "dtmc",
23
+ num_of_frame_in_sequence: int = 3,
24
+ tl_satisfaction_threshold: float = 0.6,
25
+ detection_threshold: float = 0.5,
26
+ vlm_detection_threshold: float = 0.35,
27
+ image_output_dir: str = "output"
28
+ ):
29
+ """Find relevant frames from a video that satisfy a specification."""
30
+
31
+ object_frame_dict = {}
32
+ vlm = VLLMClient()
33
+ # vlm = InternVL(model_name=model_name, device=device)
34
+
35
+ automaton = VideoAutomaton(include_initial_state=True)
36
+ automaton.set_up(proposition_set=proposition)
37
+
38
+ checker = PropertyChecker(
39
+ proposition=proposition,
40
+ specification=specification,
41
+ model_type=model_type,
42
+ tl_satisfaction_threshold=tl_satisfaction_threshold,
43
+ detection_threshold=detection_threshold
44
+ )
45
+
46
+ frame_of_interest = FramesofInterest(num_of_frame_in_sequence)
47
+
48
+ frame_windows = []
49
+ for i in range(0, len(frames), num_of_frame_in_sequence):
50
+ frame_windows.append(frames[i : i + num_of_frame_in_sequence])
51
+
52
+ def process_frame(sequence_of_frames: list[np.ndarray], frame_count: int):
53
+ object_of_interest = {}
54
+
55
+ for prop in proposition:
56
+ detected_object = vlm.detect(
57
+ seq_of_frames=sequence_of_frames,
58
+ scene_description=prop,
59
+ threshold=vlm_detection_threshold
60
+ )
61
+ object_of_interest[prop] = detected_object
62
+ if detected_object.is_detected:
63
+ multi_frame_arr = [frame_count * num_of_frame_in_sequence + j for j in range(num_of_frame_in_sequence)]
64
+ if prop in object_frame_dict:
65
+ object_frame_dict[prop].extend(multi_frame_arr)
66
+ else:
67
+ object_frame_dict[prop] = multi_frame_arr
68
+ if PRINT_ALL:
69
+ print(f"\t{prop}: {detected_object.confidence}->{detected_object.probability}")
70
+
71
+ frame = VideoFrame(
72
+ frame_idx=frame_count,
73
+ frame_images=sequence_of_frames,
74
+ object_of_interest=object_of_interest,
75
+ )
76
+ return frame
77
+
78
+ if PRINT_ALL:
79
+ looper = enumerate(frame_windows)
80
+ else:
81
+ looper = tqdm.tqdm(enumerate(frame_windows), total=len(frame_windows))
82
+
83
+ for i, sequence_of_frames in looper:
84
+ if PRINT_ALL:
85
+ print("\n" + "*"*50 + f" {i}/{len(frame_windows)-1} " + "*"*50)
86
+ print("Detections:")
87
+ frame = process_frame(sequence_of_frames, i)
88
+ if PRINT_ALL:
89
+ os.makedirs(image_output_dir, exist_ok=True)
90
+ frame.save_frame_img(save_path=os.path.join(image_output_dir, f"{i}"))
91
+
92
+ if checker.validate_frame(frame_of_interest=frame):
93
+ automaton.add_frame(frame=frame)
94
+ frame_of_interest.frame_buffer.append(frame)
95
+ model_check = checker.check_automaton(automaton=automaton)
96
+ if model_check:
97
+ automaton.reset()
98
+ frame_of_interest.flush_frame_buffer()
99
+
100
+ foi = frame_of_interest.foi_list
101
+
102
+ if PRINT_ALL:
103
+ print("\n" + "-"*107)
104
+ print("Detected frames of interest:")
105
+ print(foi)
106
+
107
+ return foi, object_frame_dict
108
+
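
A minimal usage sketch of run_nsvs under stated assumptions: the ns_vfs package is importable, the vLLM server from vllm_serve.sh is reachable at its default address (run_nsvs instantiates VLLMClient()), and the frames, propositions, and specification below are placeholders in the form PULS produces:

    import numpy as np
    from ns_vfs.nsvs import run_nsvs

    frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(30)]  # dummy RGB frames
    foi, object_frame_dict = run_nsvs(
        frames=frames,
        proposition=["man_eats", "man_gets_up"],
        specification='"man_eats" U "man_gets_up"',
        num_of_frame_in_sequence=3,
    )
    print(foi)                # groups of frame indices that satisfy the specification
    print(object_frame_dict)  # proposition -> frame indices where it was detected
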
ns_vfs/puls/__init__.py ADDED
File without changes
ns_vfs/puls/llm.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import json
3
+ import os
4
+
5
+ class LLM:
6
+ def __init__(self, client, save_dir=""): # pass in save_dir to start saving
7
+ self.client = client
8
+ self.history = []
9
+ self.save_dir = save_dir
10
+ if save_dir != "":
11
+ os.makedirs(save_dir, exist_ok=True)
12
+
13
+ def prompt(self, p, openai_model):
14
+ user_message = {"role": "user", "content": [{"type": "text", "text": p}]}
15
+ self.history.append(user_message)
16
+
17
+ response = self.client.chat.completions.create(
18
+ model=openai_model,
19
+ messages=self.history,
20
+ store=False,
21
+ )
22
+ assistant_response = response.choices[0].message.content
23
+ assistant_message = {"role": "assistant", "content": [{"type": "text", "text": assistant_response}]}
24
+ self.history.append(assistant_message)
25
+
26
+ return assistant_response
27
+
28
+ def save_history(self, filename="conversation_history.json"):
29
+ if self.save_dir == "":
30
+ return None
31
+
32
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
33
+ base_name, extension = os.path.splitext(filename)
34
+ timestamped_filename = f"{base_name}_{timestamp}{extension}"
35
+
36
+ save_path = os.path.join(self.save_dir, timestamped_filename)
37
+ try:
38
+ with open(save_path, "w", encoding="utf-8") as f:
39
+ json.dump(self.history, f, indent=4, ensure_ascii=False)
40
+ return save_path
41
+ except Exception as e:
42
+ print(f"Failed to save conversation history: {e}")
43
+ return None
44
+
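
A small usage sketch of the LLM wrapper, assuming OPENAI_API_KEY is set in the environment; the model name and save directory are illustrative only:

    from openai import OpenAI

    llm = LLM(OpenAI(), save_dir="output/llm_logs")  # non-empty save_dir enables history saving
    answer = llm.prompt("Reply with a single word: OK", openai_model="gpt-4o-mini")
    print(answer)
    print(llm.save_history())  # e.g. output/llm_logs/conversation_history_<timestamp>.json
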
ns_vfs/puls/prompts.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def find_prompt(prompt):
2
+ full_prompt = f"""
3
+ You are an intelligent agent designed to extract structured representations from video description prompts. You will operate in two stages: (1) proposition extraction and (2) TL specification generation.
4
+
5
+ Stage 1: Proposition Extraction
6
+
7
+ Given an input prompt describing a sequence in a video, extract the atomic propositions that describe the underlying events or facts explicitly referenced. These propositions should describe the combined semantics of object-action or object-object relationships stated in the prompt — avoid making assumptions or inferring any additional events. Avoid TL keywords such as 'and', 'or', 'not', 'until'.
8
+ For example, given the prompt "A man is eating until he gets up", the correct propositions are ["man eats", "man gets up"].
9
+
10
+ Stage 2: TL Specification Generation
11
+
12
+ Using only the list of propositions extracted in Stage 1, generate a single Temporal Logic (TL) specification that captures the temporal and logical structure implied by the initial prompt.
13
+
14
+ Rules:
15
+ - The formula must use each proposition **exactly once**
16
+ - Use only the TL operators: `AND`, `OR`, `NOT`, `UNTIL`
17
+ - Do **not** infer new events or rephrase propositions.
18
+ - The formula should reflect the temporal or logical relationships between the propositions in a way that makes semantic sense.
19
+
20
+ **Examples**
21
+
22
+ Example 1: "A child is playing with his kite and running around before he unfortunately falls down"
23
+ Output:
24
+ {{
25
+ "proposition": ["child plays with kite", "child runs around", "child falls"],
26
+ "specification": "(child plays with kite AND child runs around) UNTIL child falls"
27
+ }}
28
+
29
+ Example 2: "In a dimly lit room, two robots stand silently. Suddenly, either the red robot starts blinking or the green robot does not turn off."
30
+ Output:
31
+ {{
32
+ "proposition": ["robots stand silently", "red robot starts blinking", "green robot turns off"],
33
+ "specification": "robots stand silently UNTIL (red robot starts blinking OR NOT green robot turns off)"
34
+ }}
35
+
36
+ Example 3: "Inside a cave, a man holds a lantern. A minute later, he suddenly sees a dragon."
37
+ Output:
38
+ {{
39
+ "proposition": ["man holds lantern", "man sees dragon"],
40
+ "specification": "man holds lantern UNTIL man sees dragon"
41
+ }}
42
+
43
+ Example 4: "The girl is turning on the computer."
44
+ Output:
45
+ {{
46
+ "proposition": ["girl turns on computer"],
47
+ "specification": "(girl turns on computer)"
48
+ }}
49
+
50
+ **Now process the following prompt:**
51
+ Input:
52
+ {{
53
+ "prompt": "{prompt}"
54
+ }}
55
+
56
+ Expected Output (only output the following JSON structure — nothing else):
57
+ {{
58
+ "proposition": [...],
59
+ "specification": "..."
60
+ }}
61
+ """
62
+ return full_prompt
ns_vfs/puls/puls.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ns_vfs.puls.llm import *
2
+ from ns_vfs.puls.prompts import *
3
+ from openai import OpenAI
4
+ import json
5
+ import os
6
+ import re
7
+
8
+ def clean_and_parse_json(raw_str):
9
+ start = raw_str.find('{')
10
+ end = raw_str.rfind('}') + 1
11
+ json_str = raw_str[start:end]
12
+ return json.loads(json_str)
13
+
14
+ def process_specification(specification, propositions):
15
+ new_propositions = []
16
+ for prop in propositions:
17
+ prop_cleaned = re.sub(r"^[^a-zA-Z]+|[^a-zA-Z]+$", "", prop)
18
+ prop_cleaned = re.sub(r"\s+", "_", prop_cleaned)
19
+ prop_cleaned = prop_cleaned.replace("'", "")
20
+ new_propositions.append(prop_cleaned)
21
+
22
+ for original, new in zip(propositions, new_propositions):
23
+ specification = specification.replace(original, f'"{new}"')
24
+
25
+ replacements = {
26
+ "AND": "&",
27
+ "OR": "|",
28
+ "UNTIL": "U",
29
+ "ALWAYS": "G",
30
+ "EVENTUALLY": "F",
31
+ "NOT": "!"
32
+ }
33
+ for word, symbol in replacements.items():
34
+ specification = specification.replace(word, symbol)
35
+
36
+ return new_propositions, specification
37
+
38
+ def PULS(query, openai_save_path, openai_model="o1-mini", openai_key=None):
39
+ if openai_key:
40
+ os.environ["OPENAI_API_KEY"] = openai_key
41
+
42
+ client = OpenAI()
43
+ llm = LLM(client, save_dir=openai_save_path)
44
+
45
+ full_prompt = find_prompt(query)
46
+ llm_output = llm.prompt(full_prompt, openai_model)
47
+ parsed = clean_and_parse_json(llm_output)
48
+
49
+ final_output = {}
50
+
51
+ cleaned_props, processed_spec = process_specification(parsed["specification"], parsed["proposition"])
52
+ final_output["proposition"] = cleaned_props
53
+ final_output["specification"] = processed_spec
54
+
55
+ saved_path = llm.save_history()
56
+ final_output["saved_path"] = saved_path
57
+
58
+ return final_output
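
A worked example of process_specification on output shaped like the prompt above: proposition names are trimmed and snake_cased, each occurrence in the specification is quoted, and the word operators become single-character symbols:

    props, spec = process_specification(
        specification="man eats UNTIL man gets up",
        propositions=["man eats", "man gets up"],
    )
    print(props)  # ['man_eats', 'man_gets_up']
    print(spec)   # '"man_eats" U "man_gets_up"'
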
ns_vfs/video/frame.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import numpy as np
3
+ import cv2
4
+
5
+
6
+ class VideoFrame:
7
+ """Frame class."""
8
+ def __init__(
9
+ self,
10
+ frame_idx: int,
11
+ frame_images: List[np.ndarray],
12
+ object_of_interest: dict
13
+ ):
14
+ self.frame_idx = frame_idx
15
+ self.frame_images = frame_images
16
+ self.object_of_interest = object_of_interest
17
+
18
+ def save_frame_img(self, save_path: str) -> None:
19
+ """Save frame image."""
20
+ if self.frame_images is not None:
21
+ for idx, img in enumerate(self.frame_images):
22
+ cv2.imwrite(f"{save_path}_{idx}.png", cv2.cvtColor(img, cv2.COLOR_RGB2BGR))  # frames are kept in RGB; convert to BGR for imwrite
23
+
24
+ def thresholded_detected_objects(self, threshold) -> dict:
25
+ """Get all detected object."""
26
+
27
+ detected_obj = {}
28
+ for prop in self.object_of_interest.keys():
29
+ probability = self.object_of_interest[prop].get_detected_probability()
30
+ if probability > threshold:
31
+ detected_obj[prop] = probability
32
+ return detected_obj
33
+
34
+
35
+ class FramesofInterest:
36
+ def __init__(self, num_of_frame_in_sequence):
37
+ self.num_of_frame_in_sequence = num_of_frame_in_sequence
38
+ self.foi_list = []
39
+ self.frame_buffer = []
40
+
41
+ def flush_frame_buffer(self):
42
+ """Flush frame buffer to frame of interest."""
43
+ if self.frame_buffer:
44
+ frame_interval = [frame.frame_idx for frame in self.frame_buffer]
45
+ self.foi_list.append([
46
+ i*self.num_of_frame_in_sequence + j
47
+ for i in frame_interval
48
+ for j in range(self.num_of_frame_in_sequence)
49
+ ])
50
+ self.frame_buffer = []
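
A small sketch of the index arithmetic in flush_frame_buffer: each buffered window index expands into the indices of its constituent frames. The frame indices are made up:

    foi = FramesofInterest(num_of_frame_in_sequence=3)
    foi.frame_buffer = [
        VideoFrame(frame_idx=2, frame_images=[], object_of_interest={}),
        VideoFrame(frame_idx=3, frame_images=[], object_of_interest={}),
    ]
    foi.flush_frame_buffer()
    print(foi.foi_list)  # [[6, 7, 8, 9, 10, 11]]
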
ns_vfs/video/read_mp4.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any
2
+ from tqdm import tqdm
3
+ import numpy as np
4
+ import cv2
5
+ import os
6
+
7
+ from ns_vfs.video.reader import VideoFormat, VideoInfo, Reader
8
+ from ns_vfs.puls.puls import PULS
9
+
10
+
11
+ class Mp4Reader(Reader):
12
+ def __init__(self, videos: List[Dict[str, str]], openai_save_path: str, sampling_rate_fps: float = 1.0):
13
+ self.videos = videos
14
+ if sampling_rate_fps is None or sampling_rate_fps <= 0:
15
+ raise ValueError("sampling_rate_fps must be > 0")
16
+ self.openai_save_path = openai_save_path
17
+ self.sampling_rate_fps = float(sampling_rate_fps)
18
+
19
+ def _sampled_frame_indices(self, fps: float, frame_count: int) -> List[int]:
20
+ if fps <= 0:
21
+ fps = 1.0
22
+
23
+ duration_sec = frame_count / fps if frame_count > 0 else 0.0
24
+ step_sec = 1.0 / self.sampling_rate_fps
25
+
26
+ times = [t for t in np.arange(0.0, duration_sec + 1e-9, step_sec)]
27
+ idxs = sorted(set(int(round(t * fps)) for t in times if t * fps < frame_count))
28
+ if not idxs and frame_count > 0:
29
+ idxs = [0]
30
+ return idxs
31
+
32
+ def _read_one(self, video_query: Dict[str, str]) -> Dict[str, Any] | None:
33
+ path = video_query["path"]
34
+ query = video_query["query"]
35
+
36
+ cap = cv2.VideoCapture(path)
37
+ if not cap.isOpened():
38
+ return None
39
+
40
+ fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
41
+ frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
42
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0)
43
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0)
44
+
45
+ frame_idxs = self._sampled_frame_indices(fps, frame_count)
46
+
47
+ images: List[np.ndarray] = []
48
+ for idx in frame_idxs:
49
+ cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
50
+ ok, frame_bgr = cap.read()
51
+ if not ok or frame_bgr is None:
52
+ continue
53
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
54
+ images.append(frame_rgb)
55
+
56
+ if (width == 0 or height == 0) and images:
57
+ height, width = images[0].shape[:2]
58
+
59
+ video_info = VideoInfo(
60
+ format=VideoFormat.MP4,
61
+ frame_width=width,
62
+ frame_height=height,
63
+ frame_count=frame_count,
64
+ fps=float(fps) if fps else None,
65
+ )
66
+
67
+ puls_output = PULS(query, self.openai_save_path, openai_key=os.environ.get("OPENAI_API_KEY"))
68
+
69
+ cap.release()
70
+ entry = {
71
+ "tl": {
72
+ "propositions": puls_output["proposition"],
73
+ "specification": puls_output["specification"],
74
+ "query": query,
75
+ },
76
+ "metadata": {
77
+ "video_path": path,
78
+ "sampling_rate_fps": self.sampling_rate_fps,
79
+ "puls_saved_path": puls_output["saved_path"],
80
+ },
81
+ "video_info": video_info,
82
+ "images": images,
83
+ }
84
+ return entry
85
+
86
+ def read_video(self) -> List[Dict[str, Any]]:
87
+ results: List[Dict[str, Any]] = []
88
+ with tqdm(total=len(self.videos), desc="Reading MP4s") as pbar:
89
+ for v in self.videos:
90
+ entry = self._read_one(v)
91
+ if entry is not None:
92
+ results.append(entry)
93
+ pbar.update(1)
94
+ return results
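
A usage sketch of Mp4Reader; the video path and query are hypothetical, and PULS needs a valid OpenAI API key available in the environment:

    reader = Mp4Reader(
        videos=[{"path": "videos/demo.mp4", "query": "A man is eating until he gets up"}],
        openai_save_path="output/puls_logs",
        sampling_rate_fps=1.0,
    )
    entries = reader.read_video()
    print(entries[0]["tl"]["propositions"])   # e.g. ['man_eats', 'man_gets_up']
    print(entries[0]["tl"]["specification"])  # e.g. '"man_eats" U "man_gets_up"'
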
ns_vfs/video/read_tlv.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Iterable
2
+ from tqdm import tqdm
3
+ import numpy as np
4
+ import pickle
5
+ import os
6
+
7
+ from ns_vfs.video.reader import VideoFormat, VideoInfo, Reader
8
+
9
+
10
+ class TLVReader(Reader):
11
+ def __init__(self, tlv_path: str):
12
+ self.tlv_path = tlv_path # /nas/dataset/tlv-dataset-v1
13
+
14
+ def _iter_tlv(self) -> Iterable[tuple[str, str, str]]:
15
+ for dataset_dir in os.listdir(self.tlv_path):
16
+ dataset_path = os.path.join(self.tlv_path, dataset_dir)
17
+ if not os.path.isdir(dataset_path):
18
+ continue
19
+ for format_dir in os.listdir(dataset_path):
20
+ format_path = os.path.join(dataset_path, format_dir)
21
+ if not os.path.isdir(format_path):
22
+ continue
23
+ for fname in os.listdir(format_path):
24
+ if fname.endswith(".pkl"):
25
+ yield dataset_dir, format_dir, os.path.join(format_path, fname)
26
+
27
+
28
+ def read_video(self) -> List[Dict[str, Any]]:
29
+ entries: List[Dict[str, Any]] = []
30
+
31
+ total = sum(1 for _ in self._iter_tlv())
32
+ with tqdm(total=total, desc="Loading TLV files") as pbar:
33
+ for dataset_dir, format_dir, file_path in self._iter_tlv():
34
+ with open(file_path, "rb") as f:
35
+ raw = pickle.load(f)
36
+
37
+ images: List[np.ndarray] = raw["images_of_frames"]
38
+ if len(images) == 0:
39
+ pbar.update(1)
40
+ continue
41
+
42
+ video_info = VideoInfo(
43
+ format=VideoFormat.LIST_OF_ARRAY,
44
+ frame_width=images[0].shape[1],
45
+ frame_height=images[0].shape[0],
46
+ frame_count=len(images),
47
+ fps=0.1, # 1 frame/10 sec
48
+ )
49
+
50
+ entry = {
51
+ "tl": {
52
+ "propositions": raw["proposition"],
53
+ "specification": raw["ltl_formula"],
54
+ "query": self.formatter(raw["ltl_formula"]),
55
+ },
56
+ "metadata": {
57
+ "type": {"dataset": dataset_dir, "format": format_dir},
58
+ "ground_truth": [i for sub in raw["frames_of_interest"] for i in sub],
59
+ },
60
+ "video_info": video_info,
61
+ "images": images,
62
+ }
63
+ entries.append(entry)
64
+ pbar.update(1)
65
+
66
+ return entries
67
+
ns_vfs/video/reader.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field, asdict
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Dict, Any
4
+ import enum
5
+ import uuid
6
+
7
+
8
+ class VideoFormat(enum.Enum):
9
+ MP4 = "mp4"
10
+ LIST_OF_ARRAY = "list_of_array"
11
+
12
+ @dataclass
13
+ class VideoInfo:
14
+ format: VideoFormat
15
+ frame_width: int
16
+ frame_height: int
17
+ frame_count: int
18
+ video_id: uuid.UUID = field(default_factory=uuid.uuid4)
19
+ fps: float | None = None
20
+
21
+ def to_dict(self):
22
+ d = asdict(self)
23
+ d["video_id"] = str(self.video_id)
24
+ d["format"] = self.format.value
25
+ return d
26
+
27
+ class Reader(ABC):
28
+ @abstractmethod
29
+ def read_video(self) -> List[Dict[str, Any]]:
30
+ pass
31
+
32
+ def formatter(self, spec: str) -> str:
33
+ spec = spec.replace("&", " and ")
34
+ spec = spec.replace("|", " or ")
35
+ spec = spec.replace("U", " until ")
36
+ spec = spec.replace("F", " eventually ")
37
+ spec = spec.replace("G", " always ")
38
+ spec = spec.replace("X", " next ")
39
+ spec = spec.replace('"', "")
40
+ spec = spec.replace("'", "")
41
+ spec = spec.replace("(", "")
42
+ spec = spec.replace(")", "")
43
+ while " " in spec:
44
+ spec = spec.replace(" ", " ")
45
+ spec = spec.strip()
46
+ return spec
47
+
48
+
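
A worked example of Reader.formatter, which turns a symbolic specification back into a plain-language query for the VLM. Because the operator replacements are plain single-character substitutions, proposition names are expected to be lowercase snake_case (as produced by process_specification); the tiny subclass below exists only to make the abstract class instantiable:

    class _DemoReader(Reader):
        def read_video(self):
            return []

    print(_DemoReader().formatter('"man_eats" U "man_gets_up"'))
    # man_eats until man_gets_up
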
ns_vfs/vlm/__init__.py ADDED
File without changes
ns_vfs/vlm/internvl.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import logging
3
+
4
+ import numpy as np
5
+ import copy
6
+ import torch
7
+ from torch.nn.functional import softmax
8
+ from transformers import AutoModel, AutoTokenizer
9
+
10
+ from ns_vfs.vlm.internvl_utils import (
11
+ assign_device_map,
12
+ load_video_from_seq_of_frames,
13
+ split_model,
14
+ )
15
+ from ns_vfs.vlm.obj import DetectedObject
16
+
17
+ class InternVL:
18
+ """InternVL's Vision Language Model."""
19
+
20
+ def __init__(
21
+ self,
22
+ model_name: str = "InternVL2-8B",
23
+ multi_gpus: bool = False,
24
+ device: int = 0,
25
+ ) -> None:
26
+ """Initialize the InternVL model."""
27
+ logging.info(
28
+ (
29
+ "You are using the model based on the HuggingFace API. "
30
+ "The model will be downloaded to the HuggingFace cache dir."
31
+ )
32
+ )
33
+ self.model_name = model_name
34
+ self._path = f"OpenGVLab/{model_name}"
35
+ self._num_gpus = torch.cuda.device_count()
36
+ self.device = device
37
+ if multi_gpus:
38
+ device_map = split_model(model_name)
39
+ else:
40
+ device_map = assign_device_map(model_name=model_name, manual_gpu_id=device)
41
+ self.model = AutoModel.from_pretrained(
42
+ self._path,
43
+ torch_dtype=torch.bfloat16,
44
+ low_cpu_mem_usage=True,
45
+ use_flash_attn=True,
46
+ trust_remote_code=True,
47
+ device_map=device_map,
48
+ ).eval()
49
+ self.model.apply(self.move_tensors_to_gpu)
50
+ self.tokenizer = AutoTokenizer.from_pretrained(self._path, trust_remote_code=True, use_fast=False)
51
+
52
+ def reset_model(self) -> None:
53
+ """Reset the model to its initial state using pretrained weights."""
54
+ self.model = AutoModel.from_pretrained(
55
+ self._path,
56
+ torch_dtype=torch.bfloat16,
57
+ low_cpu_mem_usage=True,
58
+ use_flash_attn=True,
59
+ trust_remote_code=True,
60
+ ).eval()
61
+ self.model.apply(self.move_tensors_to_gpu)
62
+
63
+ def clear_gpu_memory(self) -> None:
64
+ """Clear CUDA cache and run garbage collection to free GPU memory."""
65
+ torch.cuda.empty_cache()
66
+ if torch.cuda.is_available():
67
+ torch.cuda.ipc_collect()
68
+ gc.collect() # Run garbage collector
69
+
70
+ def move_tensors_to_gpu(
71
+ self,
72
+ module: torch.nn.Module,
73
+ ) -> None:
74
+ """Move all tensors in the module to GPU if they are on the CPU."""
75
+ for name, tensor in module.named_buffers():
76
+ if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
77
+ module.register_buffer(
78
+ name,
79
+ tensor.cuda(self.device),
80
+ persistent=False,
81
+ )
82
+ for _, param in module.named_parameters():
83
+ if param.device.type == "cpu":
84
+ param.data = param.data.cuda(self.device)
85
+
86
+ def detect(
87
+ self,
88
+ seq_of_frames: list[np.ndarray],
89
+ scene_description: str,
90
+ threshold: float
91
+ ) -> DetectedObject:
92
+ """Detect objects in the given frame image.
93
+
94
+ Args:
95
+ seq_of_frames (list[np.ndarray]): List of video frames to process.
96
+ scene_description (str): Description of the scene.
97
+ threshold (float): Detection threshold.
98
+
99
+ Returns:
100
+ DetectedObject: Detected objects with their details.
101
+ """
102
+ parsing_rule = "You must only return a Yes or No, and not both, to any question asked. You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times. For example, if the question is \"Is there a cat present in the sequence of images?\", the answer must only be 'Yes' or 'No'."
103
+ prompt = rf"Is there a {scene_description} present in the sequence of images? " f"\n[PARSING RULE]: {parsing_rule}"
104
+
105
+ response, confidence = self.infer_with_video_confidence(
106
+ language=prompt,
107
+ seq_of_frames=seq_of_frames,
108
+ )
109
+
110
+ detected = "yes" in response.lower()
111
+ probability = self.calibrate(confidence, false_threshold=threshold)
112
+
113
+ return DetectedObject(
114
+ name=scene_description,
115
+ is_detected=detected,
116
+ confidence=round(confidence, 3),
117
+ probability=round(probability, 3),
118
+ )
119
+
120
+ def infer_with_video_confidence(
121
+ self,
122
+ language: str,
123
+ seq_of_frames: list[np.ndarray],
124
+ max_new_tokens: int = 1024,
125
+ do_sample: bool = True,
126
+ ) -> tuple[str, float]:
127
+ """Perform video inference and return response with confidence score.
128
+
129
+ Args:
130
+ language (str): The input prompt or question.
131
+ seq_of_frames (list[np.ndarray] | None):
132
+ List of video frames as numpy arrays.
133
+ video_path (str | None): Path to the input video file.
134
+ max_new_tokens (int): Maximum number of new tokens to generate.
135
+ do_sample (bool): Whether to use sampling for generation.
136
+
137
+ Returns:
138
+ tuple[str, float]: Generated response and confidence score.
139
+ """
140
+
141
+ generation_config = {
142
+ "max_new_tokens": max_new_tokens,
143
+ "do_sample": do_sample,
144
+ }
145
+
146
+ pixel_values, num_patches_list = load_video_from_seq_of_frames(
147
+ seq_of_frames=seq_of_frames, device=self.device
148
+ )
149
+
150
+ video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
151
+ language = video_prefix + language
152
+
153
+ return self.chat_with_confidence(
154
+ self.tokenizer,
155
+ pixel_values,
156
+ language,
157
+ generation_config,
158
+ num_patches_list=num_patches_list,
159
+ )
160
+
161
+ def chat_with_confidence(
162
+ self,
163
+ tokenizer: AutoTokenizer,
164
+ pixel_values: torch.Tensor,
165
+ question: str,
166
+ generation_config: dict,
167
+ num_patches_list: list[int] | None = None,
168
+ IMG_START_TOKEN: str = "<img>",
169
+ IMG_END_TOKEN: str = "</img>",
170
+ IMG_CONTEXT_TOKEN: str = "<IMG_CONTEXT>",
171
+ verbose: bool = False,
172
+ ) -> tuple[str, float]:
173
+ """Generate a response with confidence score for the given input.
174
+
175
+ Args:
176
+ tokenizer: The tokenizer to use.
177
+ pixel_values: Image tensor input.
178
+ question: The input question or prompt.
179
+ generation_config: Configuration for text generation.
180
+ num_patches_list: List of number of patches for video frames.
181
+ IMG_START_TOKEN: Token to mark the start of an image.
182
+ IMG_END_TOKEN: Token to mark the end of an image.
183
+ IMG_CONTEXT_TOKEN: Token for image context.
184
+ verbose: Whether to print verbose output.
185
+
186
+ Returns:
187
+ A tuple containing the generated response and its confidence score.
188
+ """
189
+ if num_patches_list is None:
190
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
191
+
192
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
193
+
194
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
195
+ self.model.img_context_token_id = img_context_token_id
196
+
197
+ template = copy.deepcopy(self.model.conv_template)
198
+ template.system_message = self.model.system_message
199
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
200
+
201
+ template.append_message(template.roles[0], question)
202
+ template.append_message(template.roles[1], None)
203
+ query = template.get_prompt()
204
+
205
+ if verbose and pixel_values is not None:
206
+ image_bs = pixel_values.shape[0]
207
+ print(f"dynamic ViT batch size: {image_bs}")
208
+
209
+ for num_patches in num_patches_list:
210
+ context_tokens = IMG_CONTEXT_TOKEN * self.model.num_image_token * num_patches
211
+ image_tokens = IMG_START_TOKEN + context_tokens + IMG_END_TOKEN
212
+ query = query.replace("<image>", image_tokens, 1)
213
+
214
+ model_inputs = tokenizer(query, return_tensors="pt")
215
+ input_ids = model_inputs["input_ids"].cuda(self.device)
216
+ attention_mask = model_inputs["attention_mask"].cuda(self.device)
217
+ generation_config["eos_token_id"] = eos_token_id
218
+ generation_config["return_dict_in_generate"] = True
219
+ generation_config["output_scores"] = True
220
+ generation_config["output_logits"] = True
221
+ generation_output = self.model.generate(
222
+ pixel_values=pixel_values,
223
+ input_ids=input_ids,
224
+ attention_mask=attention_mask,
225
+ **generation_config,
226
+ )
227
+ response = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)[0]
228
+ response = response.split(template.sep)[0].strip()
229
+
230
+ logits_to_compute = np.where(generation_output.sequences[0].detach().cpu().numpy() != eos_token_id)[0]
231
+ confidence = 1.0
232
+ for logit in logits_to_compute:
233
+ token = generation_output.sequences[0, logit].item()
234
+ prob = softmax(generation_output.logits[logit], dim=-1)[0, token]  # softmax over the vocabulary dimension
235
+ confidence = prob.item() * confidence
236
+ self.clear_gpu_memory()
237
+
238
+ return response, confidence
239
+
240
+ def calibrate(
241
+ self,
242
+ confidence: float,
243
+ true_threshold=0.95,
244
+ false_threshold=0.40,
245
+ target_conf=0.60,
246
+ target_prob=0.78,
247
+ p_min=0.01,
248
+ p_max=0.99,
249
+ steepness_factor=0.7,
250
+ ) -> float:
251
+ """Map confidence to probability using a sigmoid function with adjustable steepness.
252
+
253
+ Args:
254
+ confidence: Input confidence score
255
+ true_threshold: Upper threshold
256
+ false_threshold: Lower threshold
257
+ target_conf: Target confidence point
258
+ target_prob: Target probability value
259
+ p_min: Minimum probability
260
+ p_max: Maximum probability
261
+ steepness_factor: Controls curve steepness (0-1, lower = less steep)
262
+ """
263
+ if confidence <= false_threshold:
264
+ return p_min
265
+
266
+ if confidence >= true_threshold:
267
+ return p_max
268
+
269
+ # Calculate parameters to ensure target_conf maps to target_prob
270
+ # For a sigmoid function: f(x) = L / (1 + e^(-k(x-x0)))
271
+
272
+ # First, normalize the target point
273
+ x_norm = (target_conf - false_threshold) / (true_threshold - false_threshold)
274
+ y_norm = (target_prob - p_min) / (p_max - p_min)
275
+
276
+ # Find x0 (midpoint) and k (steepness) to satisfy our target point
277
+ x0 = 0.30 # Midpoint of normalized range
278
+
279
+ # Calculate base k value to hit the target point
280
+ base_k = -np.log(1 / y_norm - 1) / (x_norm - x0)
281
+
282
+ # Apply steepness factor (lower = less steep)
283
+ k = base_k * steepness_factor
284
+
285
+ # With reduced steepness, we need to adjust x0 to still hit the target point
286
+ # Solve for new x0: y = 1/(1+e^(-k(x-x0))) => x0 = x + ln(1/y-1)/k
287
+ adjusted_x0 = x_norm + np.log(1 / y_norm - 1) / k
288
+
289
+ # Apply the sigmoid with our calculated parameters
290
+ x_scaled = (confidence - false_threshold) / (true_threshold - false_threshold)
291
+ sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))
292
+
293
+ # Ensure we still hit exactly p_min and p_max at the thresholds
294
+ # by rescaling the output slightly
295
+ min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
296
+ max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))
297
+
298
+ # Normalize the output
299
+ normalized = (sigmoid_value - min_val) / (max_val - min_val)
300
+
301
+ return p_min + normalized * (p_max - p_min)
302
+
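
A usage sketch of the direct (non-vLLM) detector, assuming a CUDA GPU with enough memory to hold InternVL2-8B locally; the frames and scene description are placeholders:

    import numpy as np

    vlm = InternVL(model_name="InternVL2-8B", device=0)
    frames = [np.zeros((448, 448, 3), dtype=np.uint8) for _ in range(3)]  # dummy RGB frames
    obj = vlm.detect(seq_of_frames=frames, scene_description="red car", threshold=0.35)
    print(obj)  # Object: red car, Detected: ..., Probability: ...
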
ns_vfs/vlm/internvl_utils.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torchvision.transforms as T
6
+ from decord import VideoReader, cpu
7
+ from PIL import Image
8
+ from torchvision.transforms.functional import InterpolationMode
9
+
10
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
11
+ IMAGENET_STD = (0.229, 0.224, 0.225)
12
+
13
+
14
+ def build_transform(input_size: int) -> T.Compose:
15
+ """Builds a transformation pipeline for the given input size."""
16
+ mean, std = IMAGENET_MEAN, IMAGENET_STD
17
+ return T.Compose(
18
+ [
19
+ T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
20
+ T.Resize(
21
+ (input_size, input_size),
22
+ interpolation=InterpolationMode.BICUBIC,
23
+ ),
24
+ T.ToTensor(),
25
+ T.Normalize(mean=mean, std=std),
26
+ ]
27
+ )
28
+
29
+
30
+ def assign_device_map(model_name, manual_gpu_id=0):
31
+ device_map = {}
32
+ world_size = torch.cuda.device_count()
33
+ num_layers = {
34
+ "InternVL2-1B": 24,
35
+ "InternVL2-2B": 24,
36
+ "InternVL2-4B": 32,
37
+ "InternVL2-8B": 32,
38
+ "InternVL2-26B": 48,
39
+ "InternVL2-40B": 60,
40
+ "InternVL2-Llama3-76B": 80,
41
+ }[model_name]
42
+ for layer_idx in range(num_layers):
43
+ device_map[f"language_model.model.layers.{layer_idx}"] = manual_gpu_id
44
+
45
+ device_map["vision_model"] = manual_gpu_id
46
+ device_map["mlp1"] = manual_gpu_id
47
+ device_map["language_model.model.tok_embeddings"] = manual_gpu_id
48
+ device_map["language_model.model.embed_tokens"] = manual_gpu_id
49
+ device_map["language_model.output"] = manual_gpu_id
50
+ device_map["language_model.model.norm"] = manual_gpu_id
51
+ device_map["language_model.lm_head"] = manual_gpu_id
52
+ device_map[f"language_model.model.layers.{num_layers - 1}"] = manual_gpu_id
53
+
54
+ return device_map
55
+
56
+
57
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
58
+ best_ratio_diff = float("inf")
59
+ best_ratio = (1, 1)
60
+ area = width * height
61
+ for ratio in target_ratios:
62
+ target_aspect_ratio = ratio[0] / ratio[1]
63
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
64
+ if ratio_diff < best_ratio_diff:
65
+ best_ratio_diff = ratio_diff
66
+ best_ratio = ratio
67
+ elif ratio_diff == best_ratio_diff:
68
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
69
+ best_ratio = ratio
70
+ return best_ratio
71
+
72
+
73
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
74
+ # Convert numpy array to PIL Image if needed
75
+ if isinstance(image, np.ndarray):
76
+ image = Image.fromarray(image)
77
+
78
+ orig_width, orig_height = image.size
79
+ aspect_ratio = orig_width / orig_height
80
+
81
+ # calculate the existing image aspect ratio
82
+ target_ratios = set(
83
+ (i, j)
84
+ for n in range(min_num, max_num + 1)
85
+ for i in range(1, n + 1)
86
+ for j in range(1, n + 1)
87
+ if i * j <= max_num and i * j >= min_num
88
+ )
89
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
90
+
91
+ # find the closest aspect ratio to the target
92
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
93
+
94
+ # calculate the target width and height
95
+ target_width = image_size * target_aspect_ratio[0]
96
+ target_height = image_size * target_aspect_ratio[1]
97
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
98
+
99
+ # resize the image
100
+ resized_img = image.resize((target_width, target_height))
101
+ processed_images = []
102
+ for i in range(blocks):
103
+ box = (
104
+ (i % (target_width // image_size)) * image_size,
105
+ (i // (target_width // image_size)) * image_size,
106
+ ((i % (target_width // image_size)) + 1) * image_size,
107
+ ((i // (target_width // image_size)) + 1) * image_size,
108
+ )
109
+ # split the image
110
+ split_img = resized_img.crop(box)
111
+ processed_images.append(split_img)
112
+ assert len(processed_images) == blocks
113
+ if use_thumbnail and len(processed_images) != 1:
114
+ thumbnail_img = image.resize((image_size, image_size))
115
+ processed_images.append(thumbnail_img)
116
+ return processed_images
117
+
118
+
119
+ def split_model(model_name):
120
+ device_map = {}
121
+ world_size = torch.cuda.device_count()
122
+ num_layers = {
123
+ "InternVL2-1B": 24,
124
+ "InternVL2-2B": 24,
125
+ "InternVL2-4B": 32,
126
+ "InternVL2-8B": 32,
127
+ "InternVL2-26B": 48,
128
+ "InternVL2-40B": 60,
129
+ "InternVL2-Llama3-76B": 80,
130
+ }[model_name]
131
+ # Since the first GPU will be used for ViT, treat it as half a GPU.
132
+ num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
133
+ num_layers_per_gpu = [num_layers_per_gpu] * world_size
134
+ num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
135
+ layer_cnt = 0
136
+ for i, num_layer in enumerate(num_layers_per_gpu):
137
+ for j in range(num_layer):
138
+ device_map[f"language_model.model.layers.{layer_cnt}"] = i
139
+ layer_cnt += 1
140
+ device_map["vision_model"] = 0
141
+ device_map["mlp1"] = 0
142
+ device_map["language_model.model.tok_embeddings"] = 0
143
+ device_map["language_model.model.embed_tokens"] = 0
144
+ device_map["language_model.output"] = 0
145
+ device_map["language_model.model.norm"] = 0
146
+ device_map["language_model.lm_head"] = 0
147
+ device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
148
+
149
+ return device_map
150
+
151
+
152
+ def move_tensors_to_gpu(module):
153
+ for name, tensor in module.named_buffers():
154
+ if isinstance(tensor, torch.Tensor) and tensor.device.type == "cpu":
155
+ module.register_buffer(name, tensor.cuda(), persistent=False)
156
+ for _, param in module.named_parameters():
157
+ if param.device.type == "cpu":
158
+ param.data = param.data.cuda()
159
+
160
+
161
+ # video multi-round conversation (视频多轮对话)
162
+ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
163
+ if bound:
164
+ start, end = bound[0], bound[1]
165
+ else:
166
+ start, end = -100000, 100000
167
+ start_idx = max(first_idx, round(start * fps))
168
+ end_idx = min(round(end * fps), max_frame)
169
+ seg_size = float(end_idx - start_idx) / num_segments
170
+ frame_indices = np.array(
171
+ [int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]
172
+ )
173
+ return frame_indices
174
+
175
+
176
+ def load_video_from_seq_of_frames(
177
+ seq_of_frames: list[np.ndarray],
178
+ input_size=448,
179
+ max_num=1,
180
+ device="cuda",
181
+ dtype=torch.bfloat16,
182
+ ):
183
+ pixel_values_list, num_patches_list = [], []
184
+ transform = build_transform(input_size=input_size)
185
+ for img in seq_of_frames:
186
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
187
+ pixel_values = [transform(tile) for tile in img]
188
+ pixel_values = torch.stack(pixel_values).to(dtype=dtype, device=device) # Convert to bfloat16
189
+ num_patches_list.append(pixel_values.shape[0])
190
+ pixel_values_list.append(pixel_values)
191
+ return torch.cat(pixel_values_list), num_patches_list
192
+
193
+
194
+ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
195
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
196
+ max_frame = len(vr) - 1
197
+ fps = float(vr.get_avg_fps())
198
+
199
+ pixel_values_list, num_patches_list = [], []
200
+ transform = build_transform(input_size=input_size)
201
+ frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
202
+ for frame_index in frame_indices:
203
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
204
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
205
+ pixel_values = [transform(tile) for tile in img]
206
+ pixel_values = torch.stack(pixel_values)
207
+ num_patches_list.append(pixel_values.shape[0])
208
+ pixel_values_list.append(pixel_values.to(torch.bfloat16))
209
+ pixel_values = torch.cat(pixel_values_list)
210
+ return pixel_values, num_patches_list
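
A small sketch of dynamic_preprocess with max_num=1, the value load_video_from_seq_of_frames uses by default: only the (1, 1) tiling grid is allowed, so every frame becomes exactly one 448x448 tile and num_patches_list ends up as all ones. The frame below is a dummy array:

    import numpy as np

    frame = np.zeros((720, 1280, 3), dtype=np.uint8)  # hypothetical 720p RGB frame
    tiles = dynamic_preprocess(frame, image_size=448, use_thumbnail=True, max_num=1)
    print(len(tiles))     # 1
    print(tiles[0].size)  # (448, 448)
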
ns_vfs/vlm/obj.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ import logging
3
+
4
+ class DetectedObject:
5
+ """Detected Object class."""
6
+ def __init__(self,
7
+ name: str,
8
+ is_detected: bool,
9
+ confidence: float,
10
+ probability: float,
11
+ model_name: str | None = None,
12
+ bounding_box_of_all_obj: list[Any] | None = None):
13
+ self.name = name
14
+ self.confidence = confidence
15
+ self.probability = probability
16
+ self.is_detected = is_detected
17
+ self.model_name = model_name
18
+ self.bounding_box_of_all_obj = bounding_box_of_all_obj
19
+
20
+ def __str__(self) -> str:
21
+ return f"Object: {self.name}, Detected: {self.is_detected}, Probability: {self.get_detected_probability()}"
22
+
23
+ def get_detected_probability(self) -> float:
24
+ if not self.is_detected:
25
+ return 0
26
+ if self.probability > 0:
27
+ return self.probability
28
+ if self.confidence > 0 and self.probability == 0:
29
+ logging.info("Probability is not set, using confidence: %f", self.confidence)
30
+ return self.confidence
31
+ return self.probability
ns_vfs/vlm/vllm_client.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+
3
+ from openai import OpenAI
4
+ import numpy as np
5
+ import math
6
+ import cv2
7
+
8
+
9
+ from ns_vfs.vlm.obj import DetectedObject
10
+
11
+ class VLLMClient:
12
+ def __init__(
13
+ self,
14
+ api_key="EMPTY",
15
+ api_base="http://localhost:8000/v1",
16
+ model="OpenGVLab/InternVL2-8B",
17
+ # model="Qwen/Qwen2.5-VL-7B-Instruct",
18
+ ):
19
+ self.client = OpenAI(api_key=api_key, base_url=api_base)
20
+ self.model = model
21
+
22
+ # def _encode_frame(self, frame):
23
+ # return base64.b64encode(frame.tobytes()).decode("utf-8")
24
+ def _encode_frame(self, frame):
25
+ # Encode a uint8 numpy array (image) as a JPEG and then base64 encode it.
26
+ ret, buffer = cv2.imencode(".jpg", frame)
27
+ if not ret:
28
+ raise ValueError("Could not encode frame")
29
+ return base64.b64encode(buffer).decode("utf-8")
30
+
31
+ def detect(
32
+ self,
33
+ seq_of_frames: list[np.ndarray],
34
+ scene_description: str,
35
+ threshold: float
36
+ ) -> DetectedObject:
37
+
38
+ parsing_rule = "You must only return a Yes or No, and not both, to any question asked. You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times. For example, if the question is \"Is there a cat present in the sequence of images?\", the answer must only be 'Yes' or 'No'."
39
+ prompt = rf"Is there a {scene_description} present in the sequence of images? " f"\n[PARSING RULE]: {parsing_rule}"
40
+
41
+ # Encode each frame.
42
+ encoded_images = [self._encode_frame(frame) for frame in seq_of_frames]
43
+
44
+ # Build the user message: a text prompt plus one image for each frame.
45
+ user_content = [
46
+ {
47
+ "type": "text",
48
+ "text": "The following is the sequence of images",
49
+ }
50
+ ]
51
+ for encoded in encoded_images:
52
+ user_content.append(
53
+ {
54
+ "type": "image_url",
55
+ "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
56
+ }
57
+ )
58
+
59
+ # Create a chat completion request.
60
+ chat_response = self.client.chat.completions.create(
61
+ model=self.model,
62
+ messages=[
63
+ {"role": "system", "content": prompt},
64
+ {"role": "user", "content": user_content},
65
+ ],
66
+ max_tokens=1,
67
+ temperature=0.0,
68
+ logprobs=True,
69
+ top_logprobs=20,
70
+ )
71
+ content = chat_response.choices[0].message.content
72
+ is_detected = "yes" in content.lower()
73
+
74
+ # Retrieve the list of TopLogprob objects.
75
+ top_logprobs_list = chat_response.choices[0].logprobs.content[0].top_logprobs
76
+
77
+ # Build a mapping from token text (stripped) to its probability.
78
+ token_prob_map = {}
79
+ for top_logprob in top_logprobs_list:
80
+ token_text = top_logprob.token.strip()
81
+ token_prob_map[token_text] = np.exp(top_logprob.logprob)
82
+
83
+ # Extract probabilities for "Yes" and "No"
84
+ yes_prob = token_prob_map.get("Yes", 0.0)
85
+ no_prob = token_prob_map.get("No", 0.0)
86
+
87
+ # Compute the normalized probability for "Yes": p_yes / (p_yes + p_no)
88
+ if yes_prob + no_prob > 0:
89
+ confidence = yes_prob / (yes_prob + no_prob)
90
+ else:
91
+ raise ValueError("No probabilities for 'Yes' or 'No' found in the response.")
92
+
93
+ # print(f"Is detected: {is_detected}")
94
+ # print(f"Confidence: {confidence:.3f}")
95
+
96
+
97
+ probability = self.calibrate(confidence=confidence, false_threshold=threshold)
98
+
99
+ return DetectedObject(
100
+ name=scene_description,
101
+ is_detected=is_detected,
102
+ confidence=round(confidence, 3),
103
+ probability=round(probability, 3)
104
+ )
105
+
106
+ def calibrate(
107
+ self,
108
+ confidence: float,
109
+ true_threshold=0.95,
110
+ false_threshold=0.40,
111
+ target_conf=0.60,
112
+ target_prob=0.78,
113
+ p_min=0.01,
114
+ p_max=0.99,
115
+ steepness_factor=0.7,
116
+ ) -> float:
117
+ """Map confidence to probability using a sigmoid function with adjustable steepness.
118
+
119
+ Args:
120
+ confidence: Input confidence score
121
+ true_threshold: Upper threshold
122
+ false_threshold: Lower threshold
123
+ target_conf: Target confidence point
124
+ target_prob: Target probability value
125
+ p_min: Minimum probability
126
+ p_max: Maximum probability
127
+ steepness_factor: Controls curve steepness (0-1, lower = less steep)
128
+ """
129
+ if confidence <= false_threshold:
130
+ return p_min
131
+
132
+ if confidence >= true_threshold:
133
+ return p_max
134
+
135
+ # Calculate parameters to ensure target_conf maps to target_prob
136
+ # For a sigmoid function: f(x) = L / (1 + e^(-k(x-x0)))
137
+
138
+ # First, normalize the target point
139
+ x_norm = (target_conf - false_threshold) / (true_threshold - false_threshold)
140
+ y_norm = (target_prob - p_min) / (p_max - p_min)
141
+
142
+ # Find x0 (midpoint) and k (steepness) to satisfy our target point
143
+ x0 = 0.30 # Midpoint of normalized range
144
+
145
+ # Calculate base k value to hit the target point
146
+ base_k = -np.log(1 / y_norm - 1) / (x_norm - x0)
147
+
148
+ # Apply steepness factor (lower = less steep)
149
+ k = base_k * steepness_factor
150
+
151
+ # With reduced steepness, we need to adjust x0 to still hit the target point
152
+ # Solve for new x0: y = 1/(1+e^(-k(x-x0))) => x0 = x + ln(1/y-1)/k
153
+ adjusted_x0 = x_norm + np.log(1 / y_norm - 1) / k
154
+
155
+ # Apply the sigmoid with our calculated parameters
156
+ x_scaled = (confidence - false_threshold) / (true_threshold - false_threshold)
157
+ sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))
158
+
159
+ # Ensure we still hit exactly p_min and p_max at the thresholds
160
+ # by rescaling the output slightly
161
+ min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
162
+ max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))
163
+
164
+ # Normalize the output
165
+ normalized = (sigmoid_value - min_val) / (max_val - min_val)
166
+
167
+ return p_min + normalized * (p_max - p_min)
168
+
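
A usage sketch of VLLMClient, assuming the OpenAI-compatible vLLM server from vllm_serve.sh is running at http://localhost:8000/v1 and serving OpenGVLab/InternVL2-8B; the frames and scene description are placeholders. The calibrate calls need no server and just illustrate the clamping behavior of the default thresholds:

    import numpy as np

    client = VLLMClient()
    frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(3)]  # dummy frames
    obj = client.detect(seq_of_frames=frames, scene_description="red car", threshold=0.35)
    print(obj)  # Object: red car, Detected: ..., Probability: ...

    print(client.calibrate(0.30))  # 0.01  (<= false_threshold, clamped to p_min)
    print(client.calibrate(0.97))  # 0.99  (>= true_threshold, clamped to p_max)
    print(client.calibrate(0.60))  # ~0.78 (near target_prob; the final rescaling shifts it slightly)
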
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "nsvs"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.13"
5
+ dependencies = [
6
+ "accelerate>=1.10.1",
7
+ "cmake==3.29.6",
8
+ "decord>=0.6.0",
9
+ "einops>=0.8.1",
10
+ "gradio>=5.44.1",
11
+ "ninja>=1.13.0",
12
+ "numpy>=2.3.2",
13
+ "openai>=1.106.1",
14
+ "opencv-python>=4.11.0.86",
15
+ "safetensors>=0.6.2",
16
+ "sentencepiece>=0.2.1",
17
+ "stormpy>=1.10.1",
18
+ "timm>=1.0.19",
19
+ "tqdm>=4.67.1",
20
+ "transformers>=4.41,<4.47",
21
+ ]
22
+
scripts/no_nsvs.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from tqdm import tqdm
3
+ import json
4
+ import time
5
+ import os
6
+
7
+ from ns_vfs.vlm.internvl import InternVL
8
+ from ns_vfs.video.read_tlv import TLVReader
9
+
10
+ class RunConfig(Enum):
11
+ SLIDING_WINDOW = "sliding_window"
12
+ FRAME_WISE = "frame_wise"
13
+ CURRENT_CONFIG = RunConfig.SLIDING_WINDOW
14
+ TLV_PATH = "/nas/dataset/tlv-dataset-v1"
15
+ MODEL_NAME = "InternVL2-8B"
16
+ DEVICE = 7 # GPU device index
17
+
18
+ CALIBRATION_THRESHOLD = 0.349 # vllm threshold
19
+ THRESHOLD = 0.5 # detection threshold (fw)
20
+ STRIDE = 10 # slide stride (sw)
21
+ WINDOW = 20 # window length (sw)
22
+
23
+ def sliding_window(entry): # answers "which sequence of `WINDOW` frames can best answer the query"
24
+ query = entry["tl"]["query"]
25
+ frames = entry["images"]
26
+
27
+ model = InternVL(model_name=MODEL_NAME, device=DEVICE)
28
+ best = {"prob": -1.0, "start": 1, "end": 1}
29
+ foi = []
30
+
31
+ t = 0
32
+ windows = list(range(0, len(frames), STRIDE))
33
+ with tqdm(windows, desc=f"Sliding window (stride={STRIDE}, window={WINDOW})") as pbar:
34
+ for t in pbar:
35
+ end_idx = min(t + WINDOW, len(frames))
36
+ seq = frames[t:end_idx]
37
+
38
+ detect = model.detect(seq, query, CALIBRATION_THRESHOLD)
39
+ prob = detect.probability
40
+ is_detected = detect.is_detected
41
+
42
+ pbar.set_postfix( {"best_prob": f"{best['prob']:.3f}", "current_prob": f"{prob:.3f}", "detected": is_detected} )
43
+
44
+ if prob > best["prob"] and is_detected:
45
+ best.update({"prob": prob, "start": t, "end": end_idx})
46
+
47
+ if best["prob"] != -1.0:
48
+ foi = list(range(best["start"], best["end"]))  # end index is exclusive
49
+
50
+ return foi
51
+
52
+ def frame_wise(entry):
53
+ query = entry["tl"]["query"]
54
+ frames = entry["images"]
55
+
56
+ model = InternVL(model_name=MODEL_NAME, device=DEVICE)
57
+ foi = []
58
+
59
+ t = 0
60
+ windows = range(len(frames))
61
+ with tqdm(windows, desc=f"Framewise (threshold={THRESHOLD})") as pbar:
62
+ for t in pbar:
63
+ f = [frames[t]]
64
+
65
+ detect = model.detect(f, query, CALIBRATION_THRESHOLD)
66
+ prob = detect.probability
67
+ is_detected = detect.is_detected
68
+
69
+ pbar.set_postfix( {"current_prob": f"{prob:.3f}", "detected": is_detected} )
70
+
71
+ if prob > THRESHOLD and is_detected:
72
+ foi.append(t)
73
+
74
+ return foi
75
+
76
+
77
+ def main():
78
+ reader = TLVReader(TLV_PATH)
79
+ data = reader.read_video()
80
+ if not data:
81
+ return
82
+
83
+ folder_name = f"{MODEL_NAME}_{CURRENT_CONFIG.value}"
84
+ folder_name = os.path.join("/nas/mars/experiment_result/nsvs/nsvs2-prelims", folder_name)
85
+ if not os.path.exists(folder_name):
86
+ os.makedirs(folder_name)
87
+
88
+ with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
89
+ for i, entry in pbar:
90
+ start_time = time.time()
91
+ if CURRENT_CONFIG == RunConfig.SLIDING_WINDOW:
92
+ foi = sliding_window(entry)
93
+ else:
94
+ foi = frame_wise(entry)
95
+ end_time = time.time()
96
+
97
+ output = {
98
+ "propositions": entry["tl"]["propositions"],
99
+ "specification": entry["tl"]["specification"],
100
+ "ground_truth": entry["metadata"]["ground_truth"],
101
+ "frames_of_interest": foi,
102
+ "type": entry["metadata"]["type"],
103
+ "number_of_frames": entry["video_info"].frame_count,
104
+ "processing_time_seconds": round(end_time - start_time, 3),
105
+ }
106
+
107
+ with open(f"{folder_name}/output_{i}.json", "w") as f:
108
+ json.dump(output, f, indent=4)
109
+
110
+
111
+ if __name__ == "__main__":
112
+ main()
scripts/plot.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.interpolate import make_interp_spline
2
+ import matplotlib.colors as mcolors
3
+ from collections import defaultdict
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import json
7
+ import os
8
+
9
+
10
+ folder1 = "/nas/mars/experiment_result/nsvs/nsvs2-prelims/nsvs"
11
+ folder2 = "/nas/mars/experiment_result/nsvs/nsvs2-prelims/InternVL2-8B_frame_wise"
12
+ folder3 = "/nas/mars/experiment_result/nsvs/nsvs2-prelims/InternVL2-8B_sliding_window"
13
+
14
+ out_path_duration = "scripts/plots/plot_duration.png"
15
+ out_path_complexity = "scripts/plots/plot_complexity.png"
16
+
17
+ labels = ["NSVS", "Frame-Wise", "Sliding-Window"]
18
+ colors = ["#1f77b4", "#b4421f", "#2ca02c"]
19
+
20
+ complexity_bins = [1, 2, 3]
21
+
22
+
23
+ def _safe_json_load(path):
24
+ with open(path, "r") as f:
25
+ return json.load(f)
26
+
27
+ def _per_file_stats(pred, gt):
28
+ tp = len(pred & gt)
29
+ fp = len(pred - gt)
30
+ fn = len(gt - pred)
31
+ precision_f = tp / (tp + fp) if (tp + fp) else 0.0
32
+ recall_f = tp / (tp + fn) if (tp + fn) else 0.0
33
+ f1_file = (2 * precision_f * recall_f / (precision_f + recall_f)
34
+ if (precision_f + recall_f) else 0.0)
35
+ return tp, fp, fn, precision_f, recall_f, f1_file
36
+
37
+ def _iter_json(folder):
38
+ for fname in os.listdir(folder):
39
+ if fname.endswith(".json"):
40
+ yield os.path.join(folder, fname)
41
+
42
+ def compute_statistics(folders):
43
+ out = {}
44
+ for folder in folders:
45
+ TP = FP = FN = 0
46
+ per_file_f1 = []
47
+
48
+ for fpath in _iter_json(folder):
49
+ data = _safe_json_load(fpath)
50
+ pred = set(map(int, data.get("frames_of_interest", [])))
51
+ gt = set(map(int, data.get("ground_truth", [])))
52
+
53
+ tp, fp, fn, _, _, f1_file = _per_file_stats(pred, gt)
54
+ TP += tp; FP += fp; FN += fn
55
+ per_file_f1.append(float(f1_file))
56
+
57
+ precision = TP / (TP + FP) if (TP + FP) else 0.0
58
+ recall = TP / (TP + FN) if (TP + FN) else 0.0
59
+ f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
60
+
61
+ out[folder] = {
62
+ "precision": precision,
63
+ "recall": recall,
64
+ "f1": f1,
65
+ "TP": TP, "FP": FP, "FN": FN,
66
+ "per_file_f1": per_file_f1,
67
+ }
68
+
69
+ # Pretty print
70
+ for folder, stats in out.items():
71
+ print(f"[{folder}] Overall metrics:")
72
+ print(f" Precision: {stats['precision']:.4f}")
73
+ print(f" Recall: {stats['recall']:.4f}")
74
+ print(f" F1: {stats['f1']:.4f}\n")
75
+
76
+ return out
77
+
78
+
79
+ def _collect_duration_points(folder):
80
+ xs, ys = [], []
81
+ for fpath in _iter_json(folder):
82
+ data = _safe_json_load(fpath)
83
+ pred = set(map(int, data.get("frames_of_interest", [])))
84
+ gt = set(map(int, data.get("ground_truth", [])))
85
+ tp, fp, fn, _, _, f1 = _per_file_stats(pred, gt)
86
+
87
+ nframes = int(data.get("number_of_frames", 0))
88
+ if nframes <= 0:
89
+ continue
90
+ minutes = int(nframes / 4)  # convert frame count to minutes (assumes 4 sampled frames per minute)
91
+ xs.append(minutes)
92
+ ys.append(float(f1))
93
+ return np.array(xs, dtype=int), np.array(ys, dtype=float)
94
+
95
+ def _compute_envelope(xs, ys, bandwidth=20, smooth_band=600, smooth_center=11):
96
+ if len(xs) == 0:
97
+ return None
98
+
99
+ grouped = defaultdict(list)
100
+ for x, y in zip(xs, ys):
101
+ grouped[int(x)].append(float(y))
102
+
103
+ durations = np.array(sorted(grouped.keys()))
104
+ if len(durations) == 0:
105
+ return None
106
+
107
+ ymin_raw = np.array([min(grouped[d]) for d in durations])
108
+ ymax_raw = np.array([max(grouped[d]) for d in durations])
109
+ ymid_raw = (ymin_raw + ymax_raw) / 2.0
110
+
111
+ ymin_s, ymax_s = [], []
112
+ for d in durations:
113
+ mask = np.abs(durations - d) <= bandwidth
114
+ ymin_s.append(ymin_raw[mask].min())
115
+ ymax_s.append(ymax_raw[mask].max())
116
+ ymin_s, ymax_s = np.array(ymin_s), np.array(ymax_s)
117
+ ymid_s = (ymin_s + ymax_s) / 2.0
118
+
119
+ if len(durations) >= 4:
120
+ x_band = np.linspace(durations.min(), durations.max(), smooth_band)
121
+ ymin_smooth = make_interp_spline(durations, ymin_s, k=3)(x_band)
122
+ ymax_smooth = make_interp_spline(durations, ymax_s, k=3)(x_band)
123
+
124
+ x_center = np.linspace(durations.min(), durations.max(), smooth_center)
125
+ ymid_smooth = make_interp_spline(durations, ymid_s, k=3)(x_center)
126
+ else:
127
+ x_band, ymin_smooth, ymax_smooth = durations, ymin_s, ymax_s
128
+ x_center, ymid_smooth = durations, ymid_s
129
+
130
+ return x_band, ymin_smooth, ymax_smooth, x_center, ymid_smooth
131
+
132
+ def _shrink_band(ymin_spline, ymax_spline, factor=0.5):
133
+ center = (ymin_spline + ymax_spline) / 2.0
134
+ ymin_new = center - factor * (center - ymin_spline)
135
+ ymax_new = center + factor * (ymax_spline - center)
136
+ return ymin_new, ymax_new, center
137
+
138
+ def plot_duration(folders, labels, colors, out_path):
139
+ envs = []
140
+ for folder in folders:
141
+ xs, ys = _collect_duration_points(folder)
142
+ envs.append(_compute_envelope(xs, ys))
143
+
144
+ if all(env is None for env in envs):
145
+ print("Not enough data with valid 'number_of_frames' to plot.")
146
+ return
147
+
148
+ fig, ax = plt.subplots(figsize=(9, 6))
149
+
150
+ for env, lab, col in zip(envs, labels, colors):
151
+ if env is None:
152
+ continue
153
+ x_band, ymin_s, ymax_s, x_center, ymid_s = env
154
+ ymin_plot, ymax_plot, _ = _shrink_band(ymin_s, ymax_s, factor=0.5)
155
+
156
+ base = mcolors.to_rgb(col)
157
+ darker = tuple(max(0.0, c * 0.75) for c in base)
158
+
159
+ ax.fill_between(x_band, ymin_plot, ymax_plot, color=base, alpha=0.22)
160
+ ax.plot(x_center, ymid_s, linewidth=2.5, color=darker, label=lab)
161
+
162
+ ax.set_xlabel("Minutes", fontsize=17)
163
+ ax.set_ylabel("F1 Score", fontsize=17)
164
+ ax.tick_params(axis="both", labelsize=15)
165
+ ax.grid(True, linestyle="--", alpha=0.4)
166
+ ax.legend(fontsize=15)
167
+
168
+ fig.tight_layout()
169
+ fig.savefig(out_path, dpi=200)
170
+ plt.close(fig)
171
+
172
+
173
+ def _normalize_prop_names(prop_obj):
174
+ if isinstance(prop_obj, dict):
175
+ return {str(k) for k in prop_obj.keys() if str(k).strip()}
176
+ elif isinstance(prop_obj, (list, tuple, set)):
177
+ flat = []
178
+ for item in prop_obj:
179
+ if isinstance(item, (list, tuple, set)):
180
+ flat.extend(item)
181
+ else:
182
+ flat.append(item)
183
+ return {str(x) for x in flat if str(x).strip()}
184
+ elif prop_obj:
185
+ return {str(prop_obj)}
186
+ return set()
187
+
188
+ def _complexity_by_props(folder, bins):
189
+ by_props = defaultdict(list)
190
+ for fpath in _iter_json(folder):
191
+ data = _safe_json_load(fpath)
192
+ pred = set(map(int, data.get("frames_of_interest", [])))
193
+ gt = set(map(int, data.get("ground_truth", [])))
194
+ tp, fp, fn, _, _, f1_file = _per_file_stats(pred, gt)
195
+
196
+ prop_names = _normalize_prop_names(data.get("propositions", []))
197
+ n_props = len(prop_names)
198
+ if n_props in bins:
199
+ by_props[n_props].append(float(f1_file))
200
+ return by_props
201
+
202
+ def plot_complexity(folders, labels, colors, bins, out_path):
203
+ all_by_props = [ _complexity_by_props(f, bins) for f in folders ]
204
+
205
+ width = 0.25
206
+ offsets = [-(width), 0.0, width] # for three models
207
+ fig, ax = plt.subplots(figsize=(9, 6))
208
+
209
+ handles = []
210
+ for idx, (by_props, lab, col, off) in enumerate(zip(all_by_props, labels, colors, offsets)):
211
+ positions = [p + off for p in bins]
212
+ data = [by_props.get(k, []) for k in bins]
213
+
214
+ bp = ax.boxplot(
215
+ data,
216
+ positions=positions,
217
+ widths=width * 0.9,
218
+ patch_artist=True,
219
+ showfliers=False,
220
+ )
221
+
222
+ for box in bp['boxes']:
223
+ box.set_facecolor(col)
224
+ box.set_alpha(0.35)
225
+ box.set_edgecolor(col)
226
+ box.set_linewidth(1.5)
227
+ for element in ['whiskers', 'caps', 'medians']:
228
+ for artist in bp[element]:
229
+ artist.set_color(col)
230
+ artist.set_linewidth(1.5)
231
+
232
+ handles.append(bp["boxes"][0])
233
+
234
+ ax.set_xticks(bins)
235
+ ax.set_xticklabels([p for p in bins], fontsize=15)
236
+ ax.set_xlabel("Number of Propositions", fontsize=17)
237
+ ax.set_ylabel("F1 Score", fontsize=17)
238
+ ax.tick_params(axis="y", labelsize=15)
239
+ ax.grid(True, linestyle="--", alpha=0.4)
240
+ ax.legend(handles, labels, fontsize=13)
241
+
242
+ fig.tight_layout()
243
+ fig.savefig(out_path, dpi=200)
244
+ plt.close(fig)
245
+
246
+
247
+ if __name__ == "__main__":
248
+ folders = [folder1, folder2, folder3]
249
+ compute_statistics(folders)
250
+ plot_duration(folders, labels, colors, out_path_duration)
251
+ plot_complexity(folders, labels, colors, complexity_bins, out_path_complexity)
252
+
scripts/plots/plot_complexity.png ADDED

Git LFS Details

  • SHA256: a95c67486c6fdeb5a8ac94a484ee4c1bb79233c40fa285c2cb44ccf07e8f9b8f
  • Pointer size: 130 Bytes
  • Size of remote file: 77.8 kB
scripts/plots/plot_duration.png ADDED

Git LFS Details

  • SHA256: f5ea7015951a7132ac7b23b847484ceaf7723f471a3e254922f6aa7a1707193b
  • Pointer size: 131 Bytes
  • Size of remote file: 184 kB
vllm_serve.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # MODEL="Qwen/Qwen2.5-VL-7B-Instruct"
4
+ # MODEL="OpenGVLab/InternVL2-8B",
5
+ # export CUDA_DEVICE_ORDER="PCI_BUS_ID"
6
+ # export NCCL_P2P_DISABLE=1
7
+ # export CUDA_VISIBLE_DEVICES="0"
8
+ # export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
9
+ PORT=8000
10
+ vllm serve "OpenGVLab/InternVL2-8B" \
11
+ --port "$PORT" \
12
+ --trust-remote-code \
13
+ --limit-mm-per-prompt image=4 \
14
+ --disable-log-requests
15
+ # Optional: --max-model-len 8192
16
+ # Optional: --gpu-memory-utilization 0.97