Spaces:
Running
on
Zero
Running
on
Zero
jmeyer1980
committed on
Commit
·
2133289
1
Parent(s):
9380cea
Fix syntax errors in app.py preventing Gradio app startup on HF Spaces
Browse files
- Fix unterminated f-string literals that were split across multiple lines
- Consolidate f-strings onto single lines to resolve SyntaxError
- This restores the interactive UI for the Warbler CDA demo
- app.py +31 -53
- convert_to_jsonl.py +14 -12
- test_embedding_integration.py +16 -13
- warbler_cda/api/cli.py +48 -73
- warbler_cda/api/service.py +49 -56
- warbler_cda/conflict_detector.py +39 -64
- warbler_cda/evaporation.py +39 -76
- warbler_cda/pack_loader.py +25 -36
- warbler_cda/retrieval_api.py +66 -120
- warbler_cda/stat7_entity.py +11 -28
- warbler_cda/stat7_experiments.py +22 -30
- warbler_cda/stat7_rag_bridge.py +19 -33
- warbler_cda/utils/load_warbler_packs.py +12 -24
app.py
CHANGED
|
@@ -66,12 +66,9 @@ class PackManager:
|
|
| 66 |
self.cache_dir = Path.home() / ".warbler_cda" / "cache"
|
| 67 |
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
| 68 |
self.metadata_file = self.cache_dir / "pack_metadata.json"
|
| 69 |
-
self.skip_cache = os.getenv(
|
| 70 |
-
|
| 71 |
-
self.
|
| 72 |
-
"WARBLER_SAMPLE_ONLY", "").lower() == "true"
|
| 73 |
-
self.ingest_packs = os.getenv(
|
| 74 |
-
"WARBLER_INGEST_PACKS", "true").lower() == "true"
|
| 75 |
|
| 76 |
def _load_metadata(self) -> Optional[Dict]:
|
| 77 |
if not self.metadata_file.exists():
|
|
@@ -146,8 +143,7 @@ api = None
|
|
| 146 |
if WARBLER_AVAILABLE:
|
| 147 |
try:
|
| 148 |
embedding_provider = EmbeddingProviderFactory.get_default_provider()
|
| 149 |
-
semantic_anchors = SemanticAnchorGraph(
|
| 150 |
-
embedding_provider=embedding_provider)
|
| 151 |
stat7_bridge = STAT7RAGBridge()
|
| 152 |
|
| 153 |
api = RetrievalAPI(
|
|
@@ -176,31 +172,24 @@ if WARBLER_AVAILABLE:
|
|
| 176 |
pack_loader = PackLoader()
|
| 177 |
pack_docs = pack_loader.discover_documents()
|
| 178 |
|
| 179 |
-
if pack_docs and pack_manager.should_ingest_packs(
|
| 180 |
-
|
| 181 |
-
print(
|
| 182 |
-
f"[INFO] Ingesting {len(pack_docs)} documents from Warbler packs...")
|
| 183 |
for doc in pack_docs:
|
| 184 |
-
success = api.add_document(
|
| 185 |
-
doc["id"], doc["content"], doc["metadata"])
|
| 186 |
if not success:
|
| 187 |
print(f"[WARN] Failed to add document {doc['id']}")
|
| 188 |
packs_loaded = len(pack_docs)
|
| 189 |
pack_manager.mark_packs_ingested(1, packs_loaded)
|
| 190 |
-
print(
|
| 191 |
-
f"[OK] Loaded {packs_loaded} documents from Warbler packs")
|
| 192 |
|
| 193 |
elif pack_docs:
|
| 194 |
packs_loaded = len(pack_docs)
|
| 195 |
-
print(
|
| 196 |
-
f"[INFO] Using cached pack data ({packs_loaded} documents)")
|
| 197 |
|
| 198 |
else:
|
| 199 |
-
print(
|
| 200 |
-
"[INFO] No Warbler packs found. Using sample documents instead.")
|
| 201 |
for doc in SAMPLE_DOCS:
|
| 202 |
-
api.add_document(
|
| 203 |
-
doc["id"], doc["content"], doc["metadata"])
|
| 204 |
packs_loaded = len(SAMPLE_DOCS)
|
| 205 |
print(f"[OK] Loaded {packs_loaded} sample documents")
|
| 206 |
|
|
@@ -252,14 +241,14 @@ def query_warbler(
|
|
| 252 |
weight_stat7=weight_stat7,
|
| 253 |
)
|
| 254 |
|
| 255 |
-
print(
|
| 256 |
-
f"DEBUG: Query created - ID: {query.query_id}, Text: {query_text}")
|
| 257 |
|
| 258 |
# Execute query
|
| 259 |
assembly = api.retrieve_context(query)
|
| 260 |
|
| 261 |
print(
|
| 262 |
-
f"DEBUG: Retrieved {len(assembly.results)} results, Assembly ID: {assembly.assembly_id}"
|
|
|
|
| 263 |
|
| 264 |
elapsed_ms = (time.time() - start_time) * 1000
|
| 265 |
|
|
@@ -323,11 +312,7 @@ def query_warbler(
|
|
| 323 |
return f"Error: {str(e)}", json.dumps({"error": str(e)}, indent=2)
|
| 324 |
|
| 325 |
|
| 326 |
-
def add_document(
|
| 327 |
-
doc_id: str,
|
| 328 |
-
content: str,
|
| 329 |
-
realm_type: str,
|
| 330 |
-
realm_label: str) -> str:
|
| 331 |
"""Add a new document to the system"""
|
| 332 |
if not WARBLER_AVAILABLE or not api:
|
| 333 |
return "Warbler CDA not available."
|
|
@@ -377,8 +362,7 @@ def get_system_stats() -> str:
|
|
| 377 |
metrics['retrieval_metrics']['hybrid_queries']}\n\n"
|
| 378 |
|
| 379 |
stats += "## Quality Distribution\n\n"
|
| 380 |
-
for quality, count in metrics["retrieval_metrics"]["quality_distribution"].items(
|
| 381 |
-
):
|
| 382 |
stats += f"- {quality.capitalize()}: {count}\n"
|
| 383 |
|
| 384 |
return stats
|
|
@@ -413,19 +397,17 @@ with gr.Blocks(title="Warbler CDA - RAG System Demo", theme=gr.themes.Soft()) as
|
|
| 413 |
|
| 414 |
with gr.Row():
|
| 415 |
max_results = gr.Slider(
|
| 416 |
-
minimum=1, maximum=10, value=5, step=1, label="Max Results"
|
| 417 |
-
|
| 418 |
-
|
| 419 |
|
| 420 |
with gr.Row():
|
| 421 |
weight_semantic = gr.Slider(
|
| 422 |
-
minimum=0.0, maximum=1.0, value=0.6, step=0.1, label="Semantic Weight"
|
|
|
|
| 423 |
weight_stat7 = gr.Slider(
|
| 424 |
-
minimum=0.0,
|
| 425 |
-
|
| 426 |
-
value=0.4,
|
| 427 |
-
step=0.1,
|
| 428 |
-
label="STAT7 Weight")
|
| 429 |
|
| 430 |
query_btn = gr.Button("Search", variant="primary")
|
| 431 |
|
|
@@ -449,38 +431,34 @@ with gr.Blocks(title="Warbler CDA - RAG System Demo", theme=gr.themes.Soft()) as
|
|
| 449 |
|
| 450 |
query_btn.click(
|
| 451 |
fn=query_warbler,
|
| 452 |
-
inputs=[query_input, max_results, use_hybrid,
|
| 453 |
-
weight_semantic, weight_stat7],
|
| 454 |
outputs=[results_output, metrics_output],
|
| 455 |
)
|
| 456 |
|
| 457 |
with gr.Tab("Add Document"):
|
| 458 |
with gr.Row():
|
| 459 |
with gr.Column():
|
| 460 |
-
doc_id_input = gr.Textbox(
|
| 461 |
-
label="Document ID", placeholder="unique_doc_id")
|
| 462 |
content_input = gr.Textbox(
|
| 463 |
-
label="Content",
|
| 464 |
-
|
| 465 |
-
lines=5)
|
| 466 |
|
| 467 |
with gr.Row():
|
| 468 |
realm_type_input = gr.Dropdown(
|
| 469 |
-
choices=["wisdom", "technical",
|
| 470 |
-
"narrative", "pattern", "data"],
|
| 471 |
value="wisdom",
|
| 472 |
label="Realm Type",
|
| 473 |
)
|
| 474 |
realm_label_input = gr.Textbox(
|
| 475 |
-
label="Realm Label", placeholder="e.g., philosophy, documentation"
|
|
|
|
| 476 |
|
| 477 |
add_btn = gr.Button("Add Document", variant="primary")
|
| 478 |
add_output = gr.Textbox(label="Status", lines=3)
|
| 479 |
|
| 480 |
add_btn.click(
|
| 481 |
fn=add_document,
|
| 482 |
-
inputs=[doc_id_input, content_input,
|
| 483 |
-
realm_type_input, realm_label_input],
|
| 484 |
outputs=add_output,
|
| 485 |
)
|
| 486 |
|
|
|
|
| 66 |
self.cache_dir = Path.home() / ".warbler_cda" / "cache"
|
| 67 |
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
| 68 |
self.metadata_file = self.cache_dir / "pack_metadata.json"
|
| 69 |
+
self.skip_cache = os.getenv("WARBLER_SKIP_PACK_CACHE", "").lower() == "true"
|
| 70 |
+
self.sample_only = os.getenv("WARBLER_SAMPLE_ONLY", "").lower() == "true"
|
| 71 |
+
self.ingest_packs = os.getenv("WARBLER_INGEST_PACKS", "true").lower() == "true"
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
def _load_metadata(self) -> Optional[Dict]:
|
| 74 |
if not self.metadata_file.exists():
|
|
|
|
| 143 |
if WARBLER_AVAILABLE:
|
| 144 |
try:
|
| 145 |
embedding_provider = EmbeddingProviderFactory.get_default_provider()
|
| 146 |
+
semantic_anchors = SemanticAnchorGraph(embedding_provider=embedding_provider)
|
|
|
|
| 147 |
stat7_bridge = STAT7RAGBridge()
|
| 148 |
|
| 149 |
api = RetrievalAPI(
|
|
|
|
| 172 |
pack_loader = PackLoader()
|
| 173 |
pack_docs = pack_loader.discover_documents()
|
| 174 |
|
| 175 |
+
if pack_docs and pack_manager.should_ingest_packs(api, len(pack_docs)):
|
| 176 |
+
print(f"[INFO] Ingesting {len(pack_docs)} documents from Warbler packs...")
|
|
|
|
|
|
|
| 177 |
for doc in pack_docs:
|
| 178 |
+
success = api.add_document(doc["id"], doc["content"], doc["metadata"])
|
|
|
|
| 179 |
if not success:
|
| 180 |
print(f"[WARN] Failed to add document {doc['id']}")
|
| 181 |
packs_loaded = len(pack_docs)
|
| 182 |
pack_manager.mark_packs_ingested(1, packs_loaded)
|
| 183 |
+
print(f"[OK] Loaded {packs_loaded} documents from Warbler packs")
|
|
|
|
| 184 |
|
| 185 |
elif pack_docs:
|
| 186 |
packs_loaded = len(pack_docs)
|
| 187 |
+
print(f"[INFO] Using cached pack data ({packs_loaded} documents)")
|
|
|
|
| 188 |
|
| 189 |
else:
|
| 190 |
+
print("[INFO] No Warbler packs found. Using sample documents instead.")
|
|
|
|
| 191 |
for doc in SAMPLE_DOCS:
|
| 192 |
+
api.add_document(doc["id"], doc["content"], doc["metadata"])
|
|
|
|
| 193 |
packs_loaded = len(SAMPLE_DOCS)
|
| 194 |
print(f"[OK] Loaded {packs_loaded} sample documents")
|
| 195 |
|
|
|
|
| 241 |
weight_stat7=weight_stat7,
|
| 242 |
)
|
| 243 |
|
| 244 |
+
print(f"DEBUG: Query created - ID: {query.query_id}, Text: {query_text}")
|
|
|
|
| 245 |
|
| 246 |
# Execute query
|
| 247 |
assembly = api.retrieve_context(query)
|
| 248 |
|
| 249 |
print(
|
| 250 |
+
f"DEBUG: Retrieved {len(assembly.results)} results, Assembly ID: {assembly.assembly_id}"
|
| 251 |
+
)
|
| 252 |
|
| 253 |
elapsed_ms = (time.time() - start_time) * 1000
|
| 254 |
|
|
|
|
| 312 |
return f"Error: {str(e)}", json.dumps({"error": str(e)}, indent=2)
|
| 313 |
|
| 314 |
|
| 315 |
+
def add_document(doc_id: str, content: str, realm_type: str, realm_label: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
"""Add a new document to the system"""
|
| 317 |
if not WARBLER_AVAILABLE or not api:
|
| 318 |
return "Warbler CDA not available."
|
|
|
|
| 362 |
metrics['retrieval_metrics']['hybrid_queries']}\n\n"
|
| 363 |
|
| 364 |
stats += "## Quality Distribution\n\n"
|
| 365 |
+
for quality, count in metrics["retrieval_metrics"]["quality_distribution"].items():
|
|
|
|
| 366 |
stats += f"- {quality.capitalize()}: {count}\n"
|
| 367 |
|
| 368 |
return stats
|
|
|
|
| 397 |
|
| 398 |
with gr.Row():
|
| 399 |
max_results = gr.Slider(
|
| 400 |
+
minimum=1, maximum=10, value=5, step=1, label="Max Results"
|
| 401 |
+
)
|
| 402 |
+
use_hybrid = gr.Checkbox(label="Enable STAT7 Hybrid Scoring", value=True)
|
| 403 |
|
| 404 |
with gr.Row():
|
| 405 |
weight_semantic = gr.Slider(
|
| 406 |
+
minimum=0.0, maximum=1.0, value=0.6, step=0.1, label="Semantic Weight"
|
| 407 |
+
)
|
| 408 |
weight_stat7 = gr.Slider(
|
| 409 |
+
minimum=0.0, maximum=1.0, value=0.4, step=0.1, label="STAT7 Weight"
|
| 410 |
+
)
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
query_btn = gr.Button("Search", variant="primary")
|
| 413 |
|
|
|
|
| 431 |
|
| 432 |
query_btn.click(
|
| 433 |
fn=query_warbler,
|
| 434 |
+
inputs=[query_input, max_results, use_hybrid, weight_semantic, weight_stat7],
|
|
|
|
| 435 |
outputs=[results_output, metrics_output],
|
| 436 |
)
|
| 437 |
|
| 438 |
with gr.Tab("Add Document"):
|
| 439 |
with gr.Row():
|
| 440 |
with gr.Column():
|
| 441 |
+
doc_id_input = gr.Textbox(label="Document ID", placeholder="unique_doc_id")
|
|
|
|
| 442 |
content_input = gr.Textbox(
|
| 443 |
+
label="Content", placeholder="Enter document content...", lines=5
|
| 444 |
+
)
|
|
|
|
| 445 |
|
| 446 |
with gr.Row():
|
| 447 |
realm_type_input = gr.Dropdown(
|
| 448 |
+
choices=["wisdom", "technical", "narrative", "pattern", "data"],
|
|
|
|
| 449 |
value="wisdom",
|
| 450 |
label="Realm Type",
|
| 451 |
)
|
| 452 |
realm_label_input = gr.Textbox(
|
| 453 |
+
label="Realm Label", placeholder="e.g., philosophy, documentation"
|
| 454 |
+
)
|
| 455 |
|
| 456 |
add_btn = gr.Button("Add Document", variant="primary")
|
| 457 |
add_output = gr.Textbox(label="Status", lines=3)
|
| 458 |
|
| 459 |
add_btn.click(
|
| 460 |
fn=add_document,
|
| 461 |
+
inputs=[doc_id_input, content_input, realm_type_input, realm_label_input],
|
|
|
|
| 462 |
outputs=add_output,
|
| 463 |
)
|
| 464 |
|
convert_to_jsonl.py
CHANGED
|
@@ -1,31 +1,33 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
def convert_templates_to_jsonl(pack_dir):
|
| 5 |
"""Convert templates.json to pack_name.jsonl for a given pack directory."""
|
| 6 |
pack_name = os.path.basename(pack_dir)
|
| 7 |
-
templates_path = os.path.join(pack_dir,
|
| 8 |
-
jsonl_path = os.path.join(pack_dir, f
|
| 9 |
-
|
| 10 |
if not os.path.exists(templates_path):
|
| 11 |
print(f"No templates.json found in {pack_dir}")
|
| 12 |
return
|
| 13 |
-
|
| 14 |
-
with open(templates_path,
|
| 15 |
templates = json.load(f)
|
| 16 |
-
|
| 17 |
-
with open(jsonl_path,
|
| 18 |
for template in templates:
|
| 19 |
json.dump(template, f)
|
| 20 |
-
f.write(
|
| 21 |
-
|
| 22 |
print(f"Converted {templates_path} to {jsonl_path}")
|
| 23 |
|
|
|
|
| 24 |
# Convert the three default packs
|
| 25 |
packs_to_convert = [
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
]
|
| 30 |
|
| 31 |
for pack in packs_to_convert:
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
|
| 4 |
+
|
| 5 |
def convert_templates_to_jsonl(pack_dir):
|
| 6 |
"""Convert templates.json to pack_name.jsonl for a given pack directory."""
|
| 7 |
pack_name = os.path.basename(pack_dir)
|
| 8 |
+
templates_path = os.path.join(pack_dir, "pack", "templates.json")
|
| 9 |
+
jsonl_path = os.path.join(pack_dir, f"{pack_name}.jsonl")
|
| 10 |
+
|
| 11 |
if not os.path.exists(templates_path):
|
| 12 |
print(f"No templates.json found in {pack_dir}")
|
| 13 |
return
|
| 14 |
+
|
| 15 |
+
with open(templates_path, "r") as f:
|
| 16 |
templates = json.load(f)
|
| 17 |
+
|
| 18 |
+
with open(jsonl_path, "w") as f:
|
| 19 |
for template in templates:
|
| 20 |
json.dump(template, f)
|
| 21 |
+
f.write("\n")
|
| 22 |
+
|
| 23 |
print(f"Converted {templates_path} to {jsonl_path}")
|
| 24 |
|
| 25 |
+
|
| 26 |
# Convert the three default packs
|
| 27 |
packs_to_convert = [
|
| 28 |
+
"packs/warbler-pack-core",
|
| 29 |
+
"packs/warbler-pack-faction-politics",
|
| 30 |
+
"packs/warbler-pack-wisdom-scrolls",
|
| 31 |
]
|
| 32 |
|
| 33 |
for pack in packs_to_convert:
|
test_embedding_integration.py
CHANGED
|
@@ -26,7 +26,9 @@ def test_embedding_provider_factory():
|
|
| 26 |
def test_sentence_transformer_provider():
|
| 27 |
"""Test SentenceTransformer provider directly"""
|
| 28 |
try:
|
| 29 |
-
from warbler_cda.embeddings.sentence_transformer_provider import
|
|
|
|
|
|
|
| 30 |
|
| 31 |
provider = SentenceTransformerEmbeddingProvider()
|
| 32 |
|
|
@@ -39,7 +41,7 @@ def test_sentence_transformer_provider():
|
|
| 39 |
print(f" - Embedding dimension: {len(embedding)}")
|
| 40 |
print(f" - Embedding shape: {provider.get_dimension()}")
|
| 41 |
|
| 42 |
-
if hasattr(provider,
|
| 43 |
stat7_coords = provider.compute_stat7_from_embedding(embedding)
|
| 44 |
print(f" - STAT7 coordinates computed: {list(stat7_coords.keys())}")
|
| 45 |
|
|
@@ -47,6 +49,7 @@ def test_sentence_transformer_provider():
|
|
| 47 |
except Exception as e:
|
| 48 |
print(f"✗ Failed to create SentenceTransformer provider: {e}")
|
| 49 |
import traceback
|
|
|
|
| 50 |
traceback.print_exc()
|
| 51 |
return False
|
| 52 |
|
|
@@ -59,20 +62,19 @@ def test_retrieval_api_with_embeddings():
|
|
| 59 |
|
| 60 |
embedding_provider = EmbeddingProviderFactory.get_default_provider()
|
| 61 |
api = RetrievalAPI(
|
| 62 |
-
embedding_provider=embedding_provider,
|
| 63 |
-
config={"enable_stat7_hybrid": True}
|
| 64 |
)
|
| 65 |
|
| 66 |
doc1 = {
|
| 67 |
"id": "doc1",
|
| 68 |
"content": "The quick brown fox jumps over the lazy dog.",
|
| 69 |
-
"metadata": {"type": "test"}
|
| 70 |
}
|
| 71 |
|
| 72 |
doc2 = {
|
| 73 |
-
"id": "doc2",
|
| 74 |
"content": "Semantic embeddings enable efficient document retrieval.",
|
| 75 |
-
"metadata": {"type": "test"}
|
| 76 |
}
|
| 77 |
|
| 78 |
api.add_document(doc1["id"], doc1["content"], doc1["metadata"])
|
|
@@ -86,7 +88,7 @@ def test_retrieval_api_with_embeddings():
|
|
| 86 |
mode=RetrievalMode.SEMANTIC_SIMILARITY,
|
| 87 |
semantic_query="fast animal jumps",
|
| 88 |
max_results=5,
|
| 89 |
-
confidence_threshold=0.3
|
| 90 |
)
|
| 91 |
|
| 92 |
assembly = api.retrieve_context(query)
|
|
@@ -98,6 +100,7 @@ def test_retrieval_api_with_embeddings():
|
|
| 98 |
except Exception as e:
|
| 99 |
print(f"✗ Failed RetrievalAPI test: {e}")
|
| 100 |
import traceback
|
|
|
|
| 101 |
traceback.print_exc()
|
| 102 |
return False
|
| 103 |
|
|
@@ -105,7 +108,9 @@ def test_retrieval_api_with_embeddings():
|
|
| 105 |
def test_embedding_cache():
|
| 106 |
"""Test embedding cache functionality"""
|
| 107 |
try:
|
| 108 |
-
from warbler_cda.embeddings.sentence_transformer_provider import
|
|
|
|
|
|
|
| 109 |
|
| 110 |
provider = SentenceTransformerEmbeddingProvider()
|
| 111 |
|
|
@@ -141,12 +146,10 @@ def main():
|
|
| 141 |
results.append(("Factory", test_embedding_provider_factory()))
|
| 142 |
|
| 143 |
print("\n2. Testing SentenceTransformer Provider...")
|
| 144 |
-
results.append(("SentenceTransformer",
|
| 145 |
-
test_sentence_transformer_provider()))
|
| 146 |
|
| 147 |
print("\n3. Testing RetrievalAPI Integration...")
|
| 148 |
-
results.append(("RetrievalAPI",
|
| 149 |
-
test_retrieval_api_with_embeddings()))
|
| 150 |
|
| 151 |
print("\n4. Testing Embedding Cache...")
|
| 152 |
results.append(("Cache", test_embedding_cache()))
|
|
|
|
| 26 |
def test_sentence_transformer_provider():
|
| 27 |
"""Test SentenceTransformer provider directly"""
|
| 28 |
try:
|
| 29 |
+
from warbler_cda.embeddings.sentence_transformer_provider import (
|
| 30 |
+
SentenceTransformerEmbeddingProvider,
|
| 31 |
+
)
|
| 32 |
|
| 33 |
provider = SentenceTransformerEmbeddingProvider()
|
| 34 |
|
|
|
|
| 41 |
print(f" - Embedding dimension: {len(embedding)}")
|
| 42 |
print(f" - Embedding shape: {provider.get_dimension()}")
|
| 43 |
|
| 44 |
+
if hasattr(provider, "compute_stat7_from_embedding"):
|
| 45 |
stat7_coords = provider.compute_stat7_from_embedding(embedding)
|
| 46 |
print(f" - STAT7 coordinates computed: {list(stat7_coords.keys())}")
|
| 47 |
|
|
|
|
| 49 |
except Exception as e:
|
| 50 |
print(f"✗ Failed to create SentenceTransformer provider: {e}")
|
| 51 |
import traceback
|
| 52 |
+
|
| 53 |
traceback.print_exc()
|
| 54 |
return False
|
| 55 |
|
|
|
|
| 62 |
|
| 63 |
embedding_provider = EmbeddingProviderFactory.get_default_provider()
|
| 64 |
api = RetrievalAPI(
|
| 65 |
+
embedding_provider=embedding_provider, config={"enable_stat7_hybrid": True}
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
doc1 = {
|
| 69 |
"id": "doc1",
|
| 70 |
"content": "The quick brown fox jumps over the lazy dog.",
|
| 71 |
+
"metadata": {"type": "test"},
|
| 72 |
}
|
| 73 |
|
| 74 |
doc2 = {
|
| 75 |
+
"id": "doc2",
|
| 76 |
"content": "Semantic embeddings enable efficient document retrieval.",
|
| 77 |
+
"metadata": {"type": "test"},
|
| 78 |
}
|
| 79 |
|
| 80 |
api.add_document(doc1["id"], doc1["content"], doc1["metadata"])
|
|
|
|
| 88 |
mode=RetrievalMode.SEMANTIC_SIMILARITY,
|
| 89 |
semantic_query="fast animal jumps",
|
| 90 |
max_results=5,
|
| 91 |
+
confidence_threshold=0.3,
|
| 92 |
)
|
| 93 |
|
| 94 |
assembly = api.retrieve_context(query)
|
|
|
|
| 100 |
except Exception as e:
|
| 101 |
print(f"✗ Failed RetrievalAPI test: {e}")
|
| 102 |
import traceback
|
| 103 |
+
|
| 104 |
traceback.print_exc()
|
| 105 |
return False
|
| 106 |
|
|
|
|
| 108 |
def test_embedding_cache():
|
| 109 |
"""Test embedding cache functionality"""
|
| 110 |
try:
|
| 111 |
+
from warbler_cda.embeddings.sentence_transformer_provider import (
|
| 112 |
+
SentenceTransformerEmbeddingProvider,
|
| 113 |
+
)
|
| 114 |
|
| 115 |
provider = SentenceTransformerEmbeddingProvider()
|
| 116 |
|
|
|
|
| 146 |
results.append(("Factory", test_embedding_provider_factory()))
|
| 147 |
|
| 148 |
print("\n2. Testing SentenceTransformer Provider...")
|
| 149 |
+
results.append(("SentenceTransformer", test_sentence_transformer_provider()))
|
|
|
|
| 150 |
|
| 151 |
print("\n3. Testing RetrievalAPI Integration...")
|
| 152 |
+
results.append(("RetrievalAPI", test_retrieval_api_with_embeddings()))
|
|
|
|
| 153 |
|
| 154 |
print("\n4. Testing Embedding Cache...")
|
| 155 |
results.append(("Cache", test_embedding_cache()))
|
warbler_cda/api/cli.py
CHANGED
|
@@ -38,25 +38,20 @@ class APIClient:
|
|
| 38 |
|
| 39 |
def single_query(self, query_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 40 |
"""Execute single query"""
|
| 41 |
-
response = self.session.post(
|
| 42 |
-
f"{self.base_url}/query", json=query_data, timeout=30)
|
| 43 |
response.raise_for_status()
|
| 44 |
return response.json()
|
| 45 |
|
| 46 |
-
def bulk_query(
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
concurrency: int = 5,
|
| 50 |
-
include_narrative: bool = False) -> Dict[str,
|
| 51 |
-
Any]:
|
| 52 |
"""Execute bulk concurrent queries"""
|
| 53 |
payload = {
|
| 54 |
"queries": queries,
|
| 55 |
"concurrency_level": concurrency,
|
| 56 |
"include_narrative_analysis": include_narrative,
|
| 57 |
}
|
| 58 |
-
response = self.session.post(
|
| 59 |
-
f"{self.base_url}/bulk_query", json=payload, timeout=120)
|
| 60 |
response.raise_for_status()
|
| 61 |
return response.json()
|
| 62 |
|
|
@@ -68,16 +63,13 @@ class APIClient:
|
|
| 68 |
|
| 69 |
def reset_metrics(self) -> Dict[str, Any]:
|
| 70 |
"""Reset service metrics"""
|
| 71 |
-
response = self.session.post(
|
| 72 |
-
f"{self.base_url}/metrics/reset", timeout=5)
|
| 73 |
response.raise_for_status()
|
| 74 |
return response.json()
|
| 75 |
|
| 76 |
|
| 77 |
@click.group()
|
| 78 |
-
@click.option("--api-url",
|
| 79 |
-
default="http://localhost:8000",
|
| 80 |
-
help="API service URL")
|
| 81 |
@click.pass_context
|
| 82 |
def cli(ctx, api_url):
|
| 83 |
"""EXP-09 CLI - STAT7 Retrieval API Command Line Interface"""
|
|
@@ -106,12 +98,14 @@ def health(ctx):
|
|
| 106 |
f" Concurrent Queries: {
|
| 107 |
health_data.get(
|
| 108 |
'concurrent_queries',
|
| 109 |
-
0)}"
|
|
|
|
| 110 |
click.echo(
|
| 111 |
f" Max Concurrent Observed: {
|
| 112 |
health_data.get(
|
| 113 |
'max_concurrent_observed',
|
| 114 |
-
0)}"
|
|
|
|
| 115 |
click.echo(f" Hybrid Queries: {health_data.get('hybrid_queries', 0)}")
|
| 116 |
click.echo(f" Errors: {health_data.get('errors', 0)}")
|
| 117 |
else:
|
|
@@ -127,10 +121,8 @@ def health(ctx):
|
|
| 127 |
@click.option("--hybrid", is_flag=True, help="Enable STAT7 hybrid scoring")
|
| 128 |
@click.option("--max-results", default=10, help="Maximum results to return")
|
| 129 |
@click.option("--confidence", default=0.6, help="Confidence threshold")
|
| 130 |
-
@click.option("--weight-semantic", default=0.6,
|
| 131 |
-
|
| 132 |
-
@click.option("--weight-stat7", default=0.4,
|
| 133 |
-
help="STAT7 weight in hybrid mode")
|
| 134 |
@click.option("--json-output", is_flag=True, help="Output as JSON")
|
| 135 |
@click.pass_context
|
| 136 |
def query(
|
|
@@ -173,30 +165,26 @@ def query(
|
|
| 173 |
click.echo(f"Query: {result.get('query_id')}")
|
| 174 |
click.echo("=" * 60)
|
| 175 |
click.echo(f"Results: {result.get('result_count')}")
|
| 176 |
-
click.echo(
|
| 177 |
-
f"Execution Time: {result.get('execution_time_ms'):.1f}ms")
|
| 178 |
|
| 179 |
if result.get("semantic_similarity"):
|
| 180 |
click.echo(
|
| 181 |
f"Semantic Similarity: {
|
| 182 |
-
result.get('semantic_similarity'):.3f}"
|
|
|
|
| 183 |
if result.get("stat7_resonance"):
|
| 184 |
-
click.echo(
|
| 185 |
-
f"STAT7 Resonance: {result.get('stat7_resonance'):.3f}")
|
| 186 |
|
| 187 |
# Show narrative analysis
|
| 188 |
if result.get("narrative_analysis"):
|
| 189 |
narr = result["narrative_analysis"]
|
| 190 |
click.echo(f"\nNarrative Analysis:")
|
| 191 |
-
click.echo(
|
| 192 |
-
|
| 193 |
-
click.echo(
|
| 194 |
-
f" Narrative Threads: {narr.get('narrative_threads', 0)}")
|
| 195 |
click.echo(f" Analysis: {narr.get('analysis')}")
|
| 196 |
|
| 197 |
# Show results
|
| 198 |
-
click.echo(
|
| 199 |
-
f"\nTop Results ({min(3, len(result.get('results', [])))}):")
|
| 200 |
for i, res in enumerate(result.get("results", [])[:3], 1):
|
| 201 |
click.echo(
|
| 202 |
f" {i}. Score: {
|
|
@@ -206,7 +194,8 @@ def query(
|
|
| 206 |
res.get(
|
| 207 |
'content',
|
| 208 |
'N/A')[
|
| 209 |
-
:50]}..."
|
|
|
|
| 210 |
|
| 211 |
click.echo()
|
| 212 |
|
|
@@ -217,10 +206,8 @@ def query(
|
|
| 217 |
@cli.command()
|
| 218 |
@click.option("--num-queries", default=5, help="Number of concurrent queries")
|
| 219 |
@click.option("--concurrency", default=5, help="Concurrency level")
|
| 220 |
-
@click.option("--semantic", multiple=True,
|
| 221 |
-
|
| 222 |
-
@click.option("--hybrid", is_flag=True,
|
| 223 |
-
help="Enable STAT7 hybrid for all queries")
|
| 224 |
@click.option("--json-output", is_flag=True, help="Output as JSON")
|
| 225 |
@click.pass_context
|
| 226 |
def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
|
|
@@ -256,11 +243,11 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
|
|
| 256 |
try:
|
| 257 |
click.echo(
|
| 258 |
f"\nExecuting {
|
| 259 |
-
len(query_data)} concurrent queries (concurrency={concurrency})..."
|
|
|
|
| 260 |
start_time = time.time()
|
| 261 |
|
| 262 |
-
result = client.bulk_query(
|
| 263 |
-
query_data, concurrency=concurrency, include_narrative=True)
|
| 264 |
|
| 265 |
elapsed = time.time() - start_time
|
| 266 |
|
|
@@ -276,21 +263,21 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
|
|
| 276 |
click.echo(f"Failed: {result.get('failed')} ✗")
|
| 277 |
click.echo(
|
| 278 |
f"Total Execution Time: {
|
| 279 |
-
result.get('execution_time_ms'):.1f}ms"
|
| 280 |
-
|
| 281 |
-
|
| 282 |
|
| 283 |
# Narrative analysis for entire batch
|
| 284 |
if result.get("batch_narrative_analysis"):
|
| 285 |
narr = result["batch_narrative_analysis"]
|
| 286 |
click.echo(f"\nBatch Narrative Analysis:")
|
| 287 |
-
click.echo(
|
| 288 |
-
f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
|
| 289 |
click.echo(
|
| 290 |
f" Total Narrative Threads: {
|
| 291 |
narr.get(
|
| 292 |
'narrative_threads',
|
| 293 |
-
0)}"
|
|
|
|
| 294 |
click.echo(f" Total Results: {narr.get('result_count', 0)}")
|
| 295 |
click.echo(f" Analysis: {narr.get('analysis')}")
|
| 296 |
|
|
@@ -301,7 +288,8 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
|
|
| 301 |
f" {
|
| 302 |
res.get('query_id')}: {
|
| 303 |
res.get('result_count')} results in {
|
| 304 |
-
res.get('execution_time_ms'):.1f}ms"
|
|
|
|
| 305 |
|
| 306 |
click.echo()
|
| 307 |
|
|
@@ -329,7 +317,8 @@ def metrics(ctx, json_output):
|
|
| 329 |
click.echo(f"Total Queries: {metrics_data.get('total_queries')}")
|
| 330 |
click.echo(
|
| 331 |
f"Concurrent Queries: {
|
| 332 |
-
metrics_data.get('concurrent_queries')}"
|
|
|
|
| 333 |
click.echo(f"Max Concurrent: {metrics_data.get('max_concurrent')}")
|
| 334 |
click.echo(f"Hybrid Queries: {metrics_data.get('hybrid_queries')}")
|
| 335 |
click.echo(f"Errors: {metrics_data.get('errors')}")
|
|
@@ -354,19 +343,12 @@ def reset_metrics(ctx):
|
|
| 354 |
|
| 355 |
|
| 356 |
@cli.command()
|
| 357 |
-
@click.option("--num-scenarios", default=3,
|
| 358 |
-
|
| 359 |
-
@click.option("--queries-per-scenario", default=10,
|
| 360 |
-
help="Queries per scenario")
|
| 361 |
@click.option("--use-hybrid", is_flag=True, help="Use STAT7 hybrid scoring")
|
| 362 |
@click.option("--output-file", help="Save results to file")
|
| 363 |
@click.pass_context
|
| 364 |
-
def stress_test(
|
| 365 |
-
ctx,
|
| 366 |
-
num_scenarios,
|
| 367 |
-
queries_per_scenario,
|
| 368 |
-
use_hybrid,
|
| 369 |
-
output_file):
|
| 370 |
"""Run EXP-10 narrative preservation stress test"""
|
| 371 |
client = ctx.obj["client"]
|
| 372 |
|
|
@@ -415,8 +397,7 @@ def stress_test(
|
|
| 415 |
)
|
| 416 |
|
| 417 |
try:
|
| 418 |
-
result = client.bulk_query(
|
| 419 |
-
queries, concurrency=10, include_narrative=True)
|
| 420 |
|
| 421 |
scenario_result = {
|
| 422 |
"scenario": scenario,
|
|
@@ -424,35 +405,29 @@ def stress_test(
|
|
| 424 |
"successful": result.get("successful"),
|
| 425 |
"failed": result.get("failed"),
|
| 426 |
"avg_query_time_ms": result.get("avg_query_time_ms"),
|
| 427 |
-
"batch_coherence": result.get(
|
| 428 |
-
"
|
| 429 |
-
|
| 430 |
-
"coherence_score",
|
| 431 |
-
0),
|
| 432 |
}
|
| 433 |
|
| 434 |
results_summary["scenarios"].append(scenario_result)
|
| 435 |
results_summary["total_coherence_score"] += scenario_result["batch_coherence"]
|
| 436 |
|
| 437 |
-
click.echo(
|
| 438 |
-
|
| 439 |
-
click.echo(
|
| 440 |
-
f" Coherence: {scenario_result['batch_coherence']:.3f}")
|
| 441 |
|
| 442 |
except Exception as e:
|
| 443 |
click.secho(f" ✗ Scenario failed: {str(e)}", fg="red")
|
| 444 |
|
| 445 |
# Summary
|
| 446 |
-
avg_coherence = results_summary["total_coherence_score"] /
|
| 447 |
-
max(1, num_scenarios)
|
| 448 |
results_summary["average_coherence"] = avg_coherence
|
| 449 |
results_summary["end_time"] = datetime.now().isoformat()
|
| 450 |
|
| 451 |
click.echo("\n" + "=" * 60)
|
| 452 |
click.echo("Stress Test Summary")
|
| 453 |
click.echo("=" * 60)
|
| 454 |
-
click.echo(
|
| 455 |
-
f"Scenarios Completed: {len(results_summary['scenarios'])}/{num_scenarios}")
|
| 456 |
click.echo(f"Average Coherence Score: {avg_coherence:.3f}")
|
| 457 |
click.echo(f"Result: {'PASS ✓' if avg_coherence > 0.7 else 'FAIL ✗'}")
|
| 458 |
click.echo()
|
|
|
|
| 38 |
|
| 39 |
def single_query(self, query_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 40 |
"""Execute single query"""
|
| 41 |
+
response = self.session.post(f"{self.base_url}/query", json=query_data, timeout=30)
|
|
|
|
| 42 |
response.raise_for_status()
|
| 43 |
return response.json()
|
| 44 |
|
| 45 |
+
def bulk_query(
|
| 46 |
+
self, queries: List[Dict[str, Any]], concurrency: int = 5, include_narrative: bool = False
|
| 47 |
+
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
| 48 |
"""Execute bulk concurrent queries"""
|
| 49 |
payload = {
|
| 50 |
"queries": queries,
|
| 51 |
"concurrency_level": concurrency,
|
| 52 |
"include_narrative_analysis": include_narrative,
|
| 53 |
}
|
| 54 |
+
response = self.session.post(f"{self.base_url}/bulk_query", json=payload, timeout=120)
|
|
|
|
| 55 |
response.raise_for_status()
|
| 56 |
return response.json()
|
| 57 |
|
|
|
|
| 63 |
|
| 64 |
def reset_metrics(self) -> Dict[str, Any]:
|
| 65 |
"""Reset service metrics"""
|
| 66 |
+
response = self.session.post(f"{self.base_url}/metrics/reset", timeout=5)
|
|
|
|
| 67 |
response.raise_for_status()
|
| 68 |
return response.json()
|
| 69 |
|
| 70 |
|
| 71 |
@click.group()
|
| 72 |
+
@click.option("--api-url", default="http://localhost:8000", help="API service URL")
|
|
|
|
|
|
|
| 73 |
@click.pass_context
|
| 74 |
def cli(ctx, api_url):
|
| 75 |
"""EXP-09 CLI - STAT7 Retrieval API Command Line Interface"""
|
|
|
|
| 98 |
f" Concurrent Queries: {
|
| 99 |
health_data.get(
|
| 100 |
'concurrent_queries',
|
| 101 |
+
0)}"
|
| 102 |
+
)
|
| 103 |
click.echo(
|
| 104 |
f" Max Concurrent Observed: {
|
| 105 |
health_data.get(
|
| 106 |
'max_concurrent_observed',
|
| 107 |
+
0)}"
|
| 108 |
+
)
|
| 109 |
click.echo(f" Hybrid Queries: {health_data.get('hybrid_queries', 0)}")
|
| 110 |
click.echo(f" Errors: {health_data.get('errors', 0)}")
|
| 111 |
else:
|
|
|
|
| 121 |
@click.option("--hybrid", is_flag=True, help="Enable STAT7 hybrid scoring")
|
| 122 |
@click.option("--max-results", default=10, help="Maximum results to return")
|
| 123 |
@click.option("--confidence", default=0.6, help="Confidence threshold")
|
| 124 |
+
@click.option("--weight-semantic", default=0.6, help="Semantic weight in hybrid mode")
|
| 125 |
+
@click.option("--weight-stat7", default=0.4, help="STAT7 weight in hybrid mode")
|
|
|
|
|
|
|
| 126 |
@click.option("--json-output", is_flag=True, help="Output as JSON")
|
| 127 |
@click.pass_context
|
| 128 |
def query(
|
|
|
|
| 165 |
click.echo(f"Query: {result.get('query_id')}")
|
| 166 |
click.echo("=" * 60)
|
| 167 |
click.echo(f"Results: {result.get('result_count')}")
|
| 168 |
+
click.echo(f"Execution Time: {result.get('execution_time_ms'):.1f}ms")
|
|
|
|
| 169 |
|
| 170 |
if result.get("semantic_similarity"):
|
| 171 |
click.echo(
|
| 172 |
f"Semantic Similarity: {
|
| 173 |
+
result.get('semantic_similarity'):.3f}"
|
| 174 |
+
)
|
| 175 |
if result.get("stat7_resonance"):
|
| 176 |
+
click.echo(f"STAT7 Resonance: {result.get('stat7_resonance'):.3f}")
|
|
|
|
| 177 |
|
| 178 |
# Show narrative analysis
|
| 179 |
if result.get("narrative_analysis"):
|
| 180 |
narr = result["narrative_analysis"]
|
| 181 |
click.echo(f"\nNarrative Analysis:")
|
| 182 |
+
click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
|
| 183 |
+
click.echo(f" Narrative Threads: {narr.get('narrative_threads', 0)}")
|
|
|
|
|
|
|
| 184 |
click.echo(f" Analysis: {narr.get('analysis')}")
|
| 185 |
|
| 186 |
# Show results
|
| 187 |
+
click.echo(f"\nTop Results ({min(3, len(result.get('results', [])))}):")
|
|
|
|
| 188 |
for i, res in enumerate(result.get("results", [])[:3], 1):
|
| 189 |
click.echo(
|
| 190 |
f" {i}. Score: {
|
|
|
|
| 194 |
res.get(
|
| 195 |
'content',
|
| 196 |
'N/A')[
|
| 197 |
+
:50]}..."
|
| 198 |
+
)
|
| 199 |
|
| 200 |
click.echo()
|
| 201 |
|
|
|
|
| 206 |
@cli.command()
|
| 207 |
@click.option("--num-queries", default=5, help="Number of concurrent queries")
|
| 208 |
@click.option("--concurrency", default=5, help="Concurrency level")
|
| 209 |
+
@click.option("--semantic", multiple=True, help="Semantic queries (can specify multiple)")
|
| 210 |
+
@click.option("--hybrid", is_flag=True, help="Enable STAT7 hybrid for all queries")
|
|
|
|
|
|
|
| 211 |
@click.option("--json-output", is_flag=True, help="Output as JSON")
|
| 212 |
@click.pass_context
|
| 213 |
def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
|
|
|
|
| 243 |
try:
|
| 244 |
click.echo(
|
| 245 |
f"\nExecuting {
|
| 246 |
+
len(query_data)} concurrent queries (concurrency={concurrency})..."
|
| 247 |
+
)
|
| 248 |
start_time = time.time()
|
| 249 |
|
| 250 |
+
result = client.bulk_query(query_data, concurrency=concurrency, include_narrative=True)
|
|
|
|
| 251 |
|
| 252 |
elapsed = time.time() - start_time
|
| 253 |
|
|
|
|
| 263 |
click.echo(f"Failed: {result.get('failed')} ✗")
|
| 264 |
click.echo(
|
| 265 |
f"Total Execution Time: {
|
| 266 |
+
result.get('execution_time_ms'):.1f}ms"
|
| 267 |
+
)
|
| 268 |
+
click.echo(f"Avg Query Time: {result.get('avg_query_time_ms'):.1f}ms")
|
| 269 |
|
| 270 |
# Narrative analysis for entire batch
|
| 271 |
if result.get("batch_narrative_analysis"):
|
| 272 |
narr = result["batch_narrative_analysis"]
|
| 273 |
click.echo(f"\nBatch Narrative Analysis:")
|
| 274 |
+
click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
|
|
|
|
| 275 |
click.echo(
|
| 276 |
f" Total Narrative Threads: {
|
| 277 |
narr.get(
|
| 278 |
'narrative_threads',
|
| 279 |
+
0)}"
|
| 280 |
+
)
|
| 281 |
click.echo(f" Total Results: {narr.get('result_count', 0)}")
|
| 282 |
click.echo(f" Analysis: {narr.get('analysis')}")
|
| 283 |
|
|
|
|
| 288 |
f" {
|
| 289 |
res.get('query_id')}: {
|
| 290 |
res.get('result_count')} results in {
|
| 291 |
+
res.get('execution_time_ms'):.1f}ms"
|
| 292 |
+
)
|
| 293 |
|
| 294 |
click.echo()
|
| 295 |
|
|
|
|
| 317 |
click.echo(f"Total Queries: {metrics_data.get('total_queries')}")
|
| 318 |
click.echo(
|
| 319 |
f"Concurrent Queries: {
|
| 320 |
+
metrics_data.get('concurrent_queries')}"
|
| 321 |
+
)
|
| 322 |
click.echo(f"Max Concurrent: {metrics_data.get('max_concurrent')}")
|
| 323 |
click.echo(f"Hybrid Queries: {metrics_data.get('hybrid_queries')}")
|
| 324 |
click.echo(f"Errors: {metrics_data.get('errors')}")
|
|
|
|
| 343 |
|
| 344 |
|
| 345 |
@cli.command()
|
| 346 |
+
@click.option("--num-scenarios", default=3, help="Number of concurrent test scenarios")
|
| 347 |
+
@click.option("--queries-per-scenario", default=10, help="Queries per scenario")
|
|
|
|
|
|
|
| 348 |
@click.option("--use-hybrid", is_flag=True, help="Use STAT7 hybrid scoring")
|
| 349 |
@click.option("--output-file", help="Save results to file")
|
| 350 |
@click.pass_context
|
| 351 |
+
def stress_test(ctx, num_scenarios, queries_per_scenario, use_hybrid, output_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
"""Run EXP-10 narrative preservation stress test"""
|
| 353 |
client = ctx.obj["client"]
|
| 354 |
|
|
|
|
| 397 |
)
|
| 398 |
|
| 399 |
try:
|
| 400 |
+
result = client.bulk_query(queries, concurrency=10, include_narrative=True)
|
|
|
|
| 401 |
|
| 402 |
scenario_result = {
|
| 403 |
"scenario": scenario,
|
|
|
|
| 405 |
"successful": result.get("successful"),
|
| 406 |
"failed": result.get("failed"),
|
| 407 |
"avg_query_time_ms": result.get("avg_query_time_ms"),
|
| 408 |
+
"batch_coherence": result.get("batch_narrative_analysis", {}).get(
|
| 409 |
+
"coherence_score", 0
|
| 410 |
+
),
|
|
|
|
|
|
|
| 411 |
}
|
| 412 |
|
| 413 |
results_summary["scenarios"].append(scenario_result)
|
| 414 |
results_summary["total_coherence_score"] += scenario_result["batch_coherence"]
|
| 415 |
|
| 416 |
+
click.echo(f" ✓ {result.get('successful')}/{len(queries)} queries successful")
|
| 417 |
+
click.echo(f" Coherence: {scenario_result['batch_coherence']:.3f}")
|
|
|
|
|
|
|
| 418 |
|
| 419 |
except Exception as e:
|
| 420 |
click.secho(f" ✗ Scenario failed: {str(e)}", fg="red")
|
| 421 |
|
| 422 |
# Summary
|
| 423 |
+
avg_coherence = results_summary["total_coherence_score"] / max(1, num_scenarios)
|
|
|
|
| 424 |
results_summary["average_coherence"] = avg_coherence
|
| 425 |
results_summary["end_time"] = datetime.now().isoformat()
|
| 426 |
|
| 427 |
click.echo("\n" + "=" * 60)
|
| 428 |
click.echo("Stress Test Summary")
|
| 429 |
click.echo("=" * 60)
|
| 430 |
+
click.echo(f"Scenarios Completed: {len(results_summary['scenarios'])}/{num_scenarios}")
|
|
|
|
| 431 |
click.echo(f"Average Coherence Score: {avg_coherence:.3f}")
|
| 432 |
click.echo(f"Result: {'PASS ✓' if avg_coherence > 0.7 else 'FAIL ✗'}")
|
| 433 |
click.echo()
|
warbler_cda/api/service.py
CHANGED
|
@@ -47,8 +47,7 @@ class STAT7Address(BaseModel):
|
|
| 47 |
"""STAT7 coordinate specification"""
|
| 48 |
|
| 49 |
realm: Dict[str, Any] = Field(
|
| 50 |
-
default_factory=lambda: {
|
| 51 |
-
"type": "retrieval_query", "label": "api_query"}
|
| 52 |
)
|
| 53 |
lineage: int = 0
|
| 54 |
adjacency: str = "semantic_proximity"
|
|
@@ -128,8 +127,7 @@ def _init_api():
|
|
| 128 |
return _api_instance
|
| 129 |
|
| 130 |
|
| 131 |
-
def _analyze_narrative_coherence(
|
| 132 |
-
results: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 133 |
"""
|
| 134 |
Analyze narrative coherence across results.
|
| 135 |
Used to validate that meaning/story threads survive concurrent access.
|
|
@@ -179,12 +177,9 @@ def _analyze_narrative_coherence(
|
|
| 179 |
relevance_scores.append(result.get("relevance_score", 0.0))
|
| 180 |
|
| 181 |
# Calculate coherence components
|
| 182 |
-
avg_semantic = sum(semantic_scores) /
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
len(stat7_resonances) if stat7_resonances else 0.0
|
| 186 |
-
avg_relevance = sum(relevance_scores) / \
|
| 187 |
-
len(relevance_scores) if relevance_scores else 0.0
|
| 188 |
|
| 189 |
# 1. RESULT QUALITY (50% weight): Average relevance of all results
|
| 190 |
# This is the primary signal - if results aren't relevant, nothing else
|
|
@@ -197,8 +192,7 @@ def _analyze_narrative_coherence(
|
|
| 197 |
semantic_variance = sum((s - avg_semantic) ** 2 for s in semantic_scores) / max(
|
| 198 |
1, len(semantic_scores)
|
| 199 |
)
|
| 200 |
-
semantic_coherence = 1.0 /
|
| 201 |
-
(1.0 + semantic_variance) if semantic_variance < 1.0 else 0.0
|
| 202 |
|
| 203 |
# 3. STAT7 ENTANGLEMENT (10% weight): Are results connected in STAT7 space?
|
| 204 |
stat7_coherence = avg_stat7
|
|
@@ -231,14 +225,18 @@ def _analyze_narrative_coherence(
|
|
| 231 |
if len(results) > 50: # Only log for bulk operations
|
| 232 |
logger.info(
|
| 233 |
f"Coherence analysis for {
|
| 234 |
-
len(results)} results: "
|
|
|
|
| 235 |
quality_score:.3f}, semantic_coh={
|
| 236 |
semantic_coherence:.3f} (var={
|
| 237 |
-
semantic_variance:.4f}), "
|
|
|
|
| 238 |
stat7_coherence:.3f}, focus={
|
| 239 |
-
focus_coherence:.3f}, "
|
|
|
|
| 240 |
len(narrative_threads)}, final={
|
| 241 |
-
coherence_score:.3f}"
|
|
|
|
| 242 |
|
| 243 |
return {
|
| 244 |
"coherence_score": coherence_score,
|
|
@@ -306,7 +304,8 @@ async def _stress_test_result(
|
|
| 306 |
if query.stat7_hybrid and query.semantic_query:
|
| 307 |
logger.info(
|
| 308 |
f"Bob Test 1: Pure semantic retrieval for query {
|
| 309 |
-
query.query_id}"
|
|
|
|
| 310 |
semantic_query = RetrievalQuery(
|
| 311 |
query_id=f"{query.query_id}_bob_semantic",
|
| 312 |
mode=RetrievalMode.SEMANTIC_SIMILARITY,
|
|
@@ -316,8 +315,7 @@ async def _stress_test_result(
|
|
| 316 |
)
|
| 317 |
semantic_assembly = api.retrieve_context(semantic_query)
|
| 318 |
semantic_ids = set(r.content_id for r in semantic_assembly.results)
|
| 319 |
-
semantic_overlap = len(
|
| 320 |
-
original_ids & semantic_ids) / max(1, len(original_ids))
|
| 321 |
|
| 322 |
log["tests_run"].append(
|
| 323 |
{
|
|
@@ -329,8 +327,7 @@ async def _stress_test_result(
|
|
| 329 |
|
| 330 |
# Test 2: Pure STAT7 retrieval (if hybrid was used)
|
| 331 |
if query.stat7_hybrid and query.stat7_address:
|
| 332 |
-
logger.info(
|
| 333 |
-
f"Bob Test 2: Pure STAT7 retrieval for query {query.query_id}")
|
| 334 |
stat7_query = RetrievalQuery(
|
| 335 |
query_id=f"{query.query_id}_bob_stat7",
|
| 336 |
mode=RetrievalMode.STAT7_ADDRESS,
|
|
@@ -340,8 +337,7 @@ async def _stress_test_result(
|
|
| 340 |
)
|
| 341 |
stat7_assembly = api.retrieve_context(stat7_query)
|
| 342 |
stat7_ids = set(r.content_id for r in stat7_assembly.results)
|
| 343 |
-
stat7_overlap = len(original_ids & stat7_ids) /
|
| 344 |
-
max(1, len(original_ids))
|
| 345 |
|
| 346 |
log["tests_run"].append(
|
| 347 |
{
|
|
@@ -356,21 +352,20 @@ async def _stress_test_result(
|
|
| 356 |
if query.confidence_threshold < 0.8:
|
| 357 |
logger.info(
|
| 358 |
f"Bob Test 3: Higher confidence threshold for query {
|
| 359 |
-
query.query_id}"
|
|
|
|
| 360 |
high_conf_query = RetrievalQuery(
|
| 361 |
query_id=f"{query.query_id}_bob_high_conf",
|
| 362 |
mode=query.mode,
|
| 363 |
semantic_query=query.semantic_query,
|
| 364 |
anchor_ids=query.anchor_ids,
|
| 365 |
max_results=query.max_results,
|
| 366 |
-
confidence_threshold=min(
|
| 367 |
-
0.85, query.confidence_threshold + 0.2),
|
| 368 |
stat7_hybrid=query.stat7_hybrid,
|
| 369 |
stat7_address=query.stat7_address,
|
| 370 |
)
|
| 371 |
high_conf_assembly = api.retrieve_context(high_conf_query)
|
| 372 |
-
high_conf_ids = set(
|
| 373 |
-
r.content_id for r in high_conf_assembly.results)
|
| 374 |
high_conf_overlap = len(original_ids & high_conf_ids) / max(
|
| 375 |
1, min(len(original_ids), len(high_conf_ids))
|
| 376 |
)
|
|
@@ -390,8 +385,7 @@ async def _stress_test_result(
|
|
| 390 |
avg_consistency = sum(consistency_scores) / len(consistency_scores)
|
| 391 |
log["consistency_score"] = avg_consistency
|
| 392 |
|
| 393 |
-
if avg_consistency >= (
|
| 394 |
-
1.0 - BobSkepticConfig.STRESS_TEST_DIVERGENCE_THRESHOLD):
|
| 395 |
log["verdict"] = "CONSISTENT"
|
| 396 |
is_consistent = True
|
| 397 |
else:
|
|
@@ -405,8 +399,7 @@ async def _stress_test_result(
|
|
| 405 |
log["stress_test_completed"] = datetime.now().isoformat()
|
| 406 |
|
| 407 |
except Exception as e:
|
| 408 |
-
logger.error(
|
| 409 |
-
f"Error during Bob's stress test for {query.query_id}: {str(e)}")
|
| 410 |
log["error"] = str(e)
|
| 411 |
log["verdict"] = "ERROR_DURING_TEST"
|
| 412 |
is_consistent = False # Err on side of caution
|
|
@@ -454,19 +447,24 @@ async def _bob_skeptic_filter(
|
|
| 454 |
# Results are verified despite low entanglement
|
| 455 |
logger.info(
|
| 456 |
f"✅ BOB VERIFIED: Query {
|
| 457 |
-
query.query_id} is consistent across stress tests. "
|
|
|
|
| 458 |
verification_log.get(
|
| 459 |
'consistency_score',
|
| 460 |
-
0.0):.3f})"
|
|
|
|
| 461 |
return "VERIFIED", verification_log
|
| 462 |
else:
|
| 463 |
# Results diverge under stress testing = quarantine
|
| 464 |
logger.warning(
|
| 465 |
f"🚨 BOB QUARANTINE: Query {
|
| 466 |
-
query.query_id} FAILED stress tests. "
|
|
|
|
| 467 |
verification_log.get(
|
| 468 |
'consistency_score',
|
| 469 |
-
0.0):.3f}) "
|
|
|
|
|
|
|
| 470 |
return "QUARANTINED", verification_log
|
| 471 |
|
| 472 |
# Results are normal - no investigation needed
|
|
@@ -484,8 +482,7 @@ async def startup_event():
|
|
| 484 |
async def health_check():
|
| 485 |
"""Health check endpoint"""
|
| 486 |
api = _init_api()
|
| 487 |
-
uptime = (datetime.now() -
|
| 488 |
-
datetime.fromisoformat(_metrics["start_time"])).total_seconds()
|
| 489 |
|
| 490 |
return HealthResponse(
|
| 491 |
status="healthy",
|
|
@@ -504,8 +501,7 @@ async def single_query(request: QueryRequest):
|
|
| 504 |
api = _init_api()
|
| 505 |
_metrics["total_queries"] += 1
|
| 506 |
_metrics["concurrent_queries"] += 1
|
| 507 |
-
_metrics["max_concurrent"] = max(
|
| 508 |
-
_metrics["max_concurrent"], _metrics["concurrent_queries"])
|
| 509 |
|
| 510 |
if request.stat7_hybrid:
|
| 511 |
_metrics["hybrid_queries"] += 1
|
|
@@ -514,8 +510,9 @@ async def single_query(request: QueryRequest):
|
|
| 514 |
start_time = time.time()
|
| 515 |
|
| 516 |
# Convert request to RetrievalQuery
|
| 517 |
-
mode = RetrievalMode[
|
| 518 |
-
"_", "").replace("SIMILARITY", "_SIMILARITY")
|
|
|
|
| 519 |
|
| 520 |
stat7_addr = None
|
| 521 |
if request.stat7_address:
|
|
@@ -571,11 +568,9 @@ async def single_query(request: QueryRequest):
|
|
| 571 |
result_count=len(results_data),
|
| 572 |
results=results_data,
|
| 573 |
semantic_similarity=(
|
| 574 |
-
results_data[0].get(
|
| 575 |
-
"semantic_similarity") if results_data else None
|
| 576 |
),
|
| 577 |
-
stat7_resonance=results_data[0].get(
|
| 578 |
-
"stat7_resonance") if results_data else None,
|
| 579 |
execution_time_ms=execution_time,
|
| 580 |
timestamp=datetime.now().isoformat(),
|
| 581 |
narrative_analysis=narrative_analysis,
|
|
@@ -600,7 +595,8 @@ async def bulk_concurrent_queries(request: BulkQueryRequest):
|
|
| 600 |
f"Executing {
|
| 601 |
len(
|
| 602 |
request.queries)} queries with concurrency level {
|
| 603 |
-
request.concurrency_level}"
|
|
|
|
| 604 |
|
| 605 |
results = []
|
| 606 |
semaphore = asyncio.Semaphore(request.concurrency_level)
|
|
@@ -615,8 +611,7 @@ async def bulk_concurrent_queries(request: BulkQueryRequest):
|
|
| 615 |
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 616 |
|
| 617 |
# Separate successful results from errors
|
| 618 |
-
successful_results = [
|
| 619 |
-
r for r in batch_results if not isinstance(r, Exception)]
|
| 620 |
|
| 621 |
errors = [
|
| 622 |
{"query_id": request.queries[i].query_id, "error": str(r)}
|
|
@@ -629,8 +624,7 @@ async def bulk_concurrent_queries(request: BulkQueryRequest):
|
|
| 629 |
for result in successful_results:
|
| 630 |
all_results_flat.extend(result.results)
|
| 631 |
|
| 632 |
-
batch_narrative_analysis = _analyze_narrative_coherence(
|
| 633 |
-
all_results_flat)
|
| 634 |
|
| 635 |
return {
|
| 636 |
"batch_id": f"batch_{int(time.time() * 1000)}",
|
|
@@ -675,21 +669,20 @@ async def ingest_documents(request: Dict[str, Any]):
|
|
| 675 |
continue
|
| 676 |
|
| 677 |
# Use the new add_document method
|
| 678 |
-
success = api.add_document(
|
| 679 |
-
doc_id=content_id, content=content, metadata=metadata)
|
| 680 |
|
| 681 |
if success:
|
| 682 |
ingested += 1
|
| 683 |
logger.info(f"✓ Ingested: {content_id}")
|
| 684 |
else:
|
| 685 |
-
failed.append(
|
| 686 |
-
{"doc_id": content_id, "error": "Document already exists"})
|
| 687 |
logger.warning(f"Document already exists: {content_id}")
|
| 688 |
|
| 689 |
logger.info(
|
| 690 |
f"Ingested {ingested}/{
|
| 691 |
len(documents)} documents (context store now has {
|
| 692 |
-
api.get_context_store_size()} total)"
|
|
|
|
| 693 |
|
| 694 |
response = {
|
| 695 |
"status": "success",
|
|
|
|
| 47 |
"""STAT7 coordinate specification"""
|
| 48 |
|
| 49 |
realm: Dict[str, Any] = Field(
|
| 50 |
+
default_factory=lambda: {"type": "retrieval_query", "label": "api_query"}
|
|
|
|
| 51 |
)
|
| 52 |
lineage: int = 0
|
| 53 |
adjacency: str = "semantic_proximity"
|
|
|
|
| 127 |
return _api_instance
|
| 128 |
|
| 129 |
|
| 130 |
+
def _analyze_narrative_coherence(results: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
|
|
| 131 |
"""
|
| 132 |
Analyze narrative coherence across results.
|
| 133 |
Used to validate that meaning/story threads survive concurrent access.
|
|
|
|
| 177 |
relevance_scores.append(result.get("relevance_score", 0.0))
|
| 178 |
|
| 179 |
# Calculate coherence components
|
| 180 |
+
avg_semantic = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0.0
|
| 181 |
+
avg_stat7 = sum(stat7_resonances) / len(stat7_resonances) if stat7_resonances else 0.0
|
| 182 |
+
avg_relevance = sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
# 1. RESULT QUALITY (50% weight): Average relevance of all results
|
| 185 |
# This is the primary signal - if results aren't relevant, nothing else
|
|
|
|
| 192 |
semantic_variance = sum((s - avg_semantic) ** 2 for s in semantic_scores) / max(
|
| 193 |
1, len(semantic_scores)
|
| 194 |
)
|
| 195 |
+
semantic_coherence = 1.0 / (1.0 + semantic_variance) if semantic_variance < 1.0 else 0.0
|
|
|
|
| 196 |
|
| 197 |
# 3. STAT7 ENTANGLEMENT (10% weight): Are results connected in STAT7 space?
|
| 198 |
stat7_coherence = avg_stat7
|
|
|
|
| 225 |
if len(results) > 50: # Only log for bulk operations
|
| 226 |
logger.info(
|
| 227 |
f"Coherence analysis for {
|
| 228 |
+
len(results)} results: "
|
| 229 |
+
f"quality={
|
| 230 |
quality_score:.3f}, semantic_coh={
|
| 231 |
semantic_coherence:.3f} (var={
|
| 232 |
+
semantic_variance:.4f}), "
|
| 233 |
+
f"stat7={
|
| 234 |
stat7_coherence:.3f}, focus={
|
| 235 |
+
focus_coherence:.3f}, "
|
| 236 |
+
f"threads={
|
| 237 |
len(narrative_threads)}, final={
|
| 238 |
+
coherence_score:.3f}"
|
| 239 |
+
)
|
| 240 |
|
| 241 |
return {
|
| 242 |
"coherence_score": coherence_score,
|
|
|
|
| 304 |
if query.stat7_hybrid and query.semantic_query:
|
| 305 |
logger.info(
|
| 306 |
f"Bob Test 1: Pure semantic retrieval for query {
|
| 307 |
+
query.query_id}"
|
| 308 |
+
)
|
| 309 |
semantic_query = RetrievalQuery(
|
| 310 |
query_id=f"{query.query_id}_bob_semantic",
|
| 311 |
mode=RetrievalMode.SEMANTIC_SIMILARITY,
|
|
|
|
| 315 |
)
|
| 316 |
semantic_assembly = api.retrieve_context(semantic_query)
|
| 317 |
semantic_ids = set(r.content_id for r in semantic_assembly.results)
|
| 318 |
+
semantic_overlap = len(original_ids & semantic_ids) / max(1, len(original_ids))
|
|
|
|
| 319 |
|
| 320 |
log["tests_run"].append(
|
| 321 |
{
|
|
|
|
| 327 |
|
| 328 |
# Test 2: Pure STAT7 retrieval (if hybrid was used)
|
| 329 |
if query.stat7_hybrid and query.stat7_address:
|
| 330 |
+
logger.info(f"Bob Test 2: Pure STAT7 retrieval for query {query.query_id}")
|
|
|
|
| 331 |
stat7_query = RetrievalQuery(
|
| 332 |
query_id=f"{query.query_id}_bob_stat7",
|
| 333 |
mode=RetrievalMode.STAT7_ADDRESS,
|
|
|
|
| 337 |
)
|
| 338 |
stat7_assembly = api.retrieve_context(stat7_query)
|
| 339 |
stat7_ids = set(r.content_id for r in stat7_assembly.results)
|
| 340 |
+
stat7_overlap = len(original_ids & stat7_ids) / max(1, len(original_ids))
|
|
|
|
| 341 |
|
| 342 |
log["tests_run"].append(
|
| 343 |
{
|
|
|
|
| 352 |
if query.confidence_threshold < 0.8:
|
| 353 |
logger.info(
|
| 354 |
f"Bob Test 3: Higher confidence threshold for query {
|
| 355 |
+
query.query_id}"
|
| 356 |
+
)
|
| 357 |
high_conf_query = RetrievalQuery(
|
| 358 |
query_id=f"{query.query_id}_bob_high_conf",
|
| 359 |
mode=query.mode,
|
| 360 |
semantic_query=query.semantic_query,
|
| 361 |
anchor_ids=query.anchor_ids,
|
| 362 |
max_results=query.max_results,
|
| 363 |
+
confidence_threshold=min(0.85, query.confidence_threshold + 0.2),
|
|
|
|
| 364 |
stat7_hybrid=query.stat7_hybrid,
|
| 365 |
stat7_address=query.stat7_address,
|
| 366 |
)
|
| 367 |
high_conf_assembly = api.retrieve_context(high_conf_query)
|
| 368 |
+
high_conf_ids = set(r.content_id for r in high_conf_assembly.results)
|
|
|
|
| 369 |
high_conf_overlap = len(original_ids & high_conf_ids) / max(
|
| 370 |
1, min(len(original_ids), len(high_conf_ids))
|
| 371 |
)
|
|
|
|
| 385 |
avg_consistency = sum(consistency_scores) / len(consistency_scores)
|
| 386 |
log["consistency_score"] = avg_consistency
|
| 387 |
|
| 388 |
+
if avg_consistency >= (1.0 - BobSkepticConfig.STRESS_TEST_DIVERGENCE_THRESHOLD):
|
|
|
|
| 389 |
log["verdict"] = "CONSISTENT"
|
| 390 |
is_consistent = True
|
| 391 |
else:
|
|
|
|
| 399 |
log["stress_test_completed"] = datetime.now().isoformat()
|
| 400 |
|
| 401 |
except Exception as e:
|
| 402 |
+
logger.error(f"Error during Bob's stress test for {query.query_id}: {str(e)}")
|
|
|
|
| 403 |
log["error"] = str(e)
|
| 404 |
log["verdict"] = "ERROR_DURING_TEST"
|
| 405 |
is_consistent = False # Err on side of caution
|
|
|
|
| 447 |
# Results are verified despite low entanglement
|
| 448 |
logger.info(
|
| 449 |
f"✅ BOB VERIFIED: Query {
|
| 450 |
+
query.query_id} is consistent across stress tests. "
|
| 451 |
+
f"High coherence is genuine, not an artifact. (consistency={
|
| 452 |
verification_log.get(
|
| 453 |
'consistency_score',
|
| 454 |
+
0.0):.3f})"
|
| 455 |
+
)
|
| 456 |
return "VERIFIED", verification_log
|
| 457 |
else:
|
| 458 |
# Results diverge under stress testing = quarantine
|
| 459 |
logger.warning(
|
| 460 |
f"🚨 BOB QUARANTINE: Query {
|
| 461 |
+
query.query_id} FAILED stress tests. "
|
| 462 |
+
f"High coherence appears to be artifact or dataset bias. (consistency={
|
| 463 |
verification_log.get(
|
| 464 |
'consistency_score',
|
| 465 |
+
0.0):.3f}) "
|
| 466 |
+
f"Escalating to Faculty for review."
|
| 467 |
+
)
|
| 468 |
return "QUARANTINED", verification_log
|
| 469 |
|
| 470 |
# Results are normal - no investigation needed
|
|
|
|
| 482 |
async def health_check():
|
| 483 |
"""Health check endpoint"""
|
| 484 |
api = _init_api()
|
| 485 |
+
uptime = (datetime.now() - datetime.fromisoformat(_metrics["start_time"])).total_seconds()
|
|
|
|
| 486 |
|
| 487 |
return HealthResponse(
|
| 488 |
status="healthy",
|
|
|
|
| 501 |
api = _init_api()
|
| 502 |
_metrics["total_queries"] += 1
|
| 503 |
_metrics["concurrent_queries"] += 1
|
| 504 |
+
_metrics["max_concurrent"] = max(_metrics["max_concurrent"], _metrics["concurrent_queries"])
|
|
|
|
| 505 |
|
| 506 |
if request.stat7_hybrid:
|
| 507 |
_metrics["hybrid_queries"] += 1
|
|
|
|
| 510 |
start_time = time.time()
|
| 511 |
|
| 512 |
# Convert request to RetrievalQuery
|
| 513 |
+
mode = RetrievalMode[
|
| 514 |
+
request.mode.upper().replace("_", "").replace("SIMILARITY", "_SIMILARITY")
|
| 515 |
+
]
|
| 516 |
|
| 517 |
stat7_addr = None
|
| 518 |
if request.stat7_address:
|
|
|
|
| 568 |
result_count=len(results_data),
|
| 569 |
results=results_data,
|
| 570 |
semantic_similarity=(
|
| 571 |
+
results_data[0].get("semantic_similarity") if results_data else None
|
|
|
|
| 572 |
),
|
| 573 |
+
stat7_resonance=results_data[0].get("stat7_resonance") if results_data else None,
|
|
|
|
| 574 |
execution_time_ms=execution_time,
|
| 575 |
timestamp=datetime.now().isoformat(),
|
| 576 |
narrative_analysis=narrative_analysis,
|
|
|
|
| 595 |
f"Executing {
|
| 596 |
len(
|
| 597 |
request.queries)} queries with concurrency level {
|
| 598 |
+
request.concurrency_level}"
|
| 599 |
+
)
|
| 600 |
|
| 601 |
results = []
|
| 602 |
semaphore = asyncio.Semaphore(request.concurrency_level)
|
|
|
|
| 611 |
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 612 |
|
| 613 |
# Separate successful results from errors
|
| 614 |
+
successful_results = [r for r in batch_results if not isinstance(r, Exception)]
|
|
|
|
| 615 |
|
| 616 |
errors = [
|
| 617 |
{"query_id": request.queries[i].query_id, "error": str(r)}
|
|
|
|
| 624 |
for result in successful_results:
|
| 625 |
all_results_flat.extend(result.results)
|
| 626 |
|
| 627 |
+
batch_narrative_analysis = _analyze_narrative_coherence(all_results_flat)
|
|
|
|
| 628 |
|
| 629 |
return {
|
| 630 |
"batch_id": f"batch_{int(time.time() * 1000)}",
|
|
|
|
| 669 |
continue
|
| 670 |
|
| 671 |
# Use the new add_document method
|
| 672 |
+
success = api.add_document(doc_id=content_id, content=content, metadata=metadata)
|
|
|
|
| 673 |
|
| 674 |
if success:
|
| 675 |
ingested += 1
|
| 676 |
logger.info(f"✓ Ingested: {content_id}")
|
| 677 |
else:
|
| 678 |
+
failed.append({"doc_id": content_id, "error": "Document already exists"})
|
|
|
|
| 679 |
logger.warning(f"Document already exists: {content_id}")
|
| 680 |
|
| 681 |
logger.info(
|
| 682 |
f"Ingested {ingested}/{
|
| 683 |
len(documents)} documents (context store now has {
|
| 684 |
+
api.get_context_store_size()} total)"
|
| 685 |
+
)
|
| 686 |
|
| 687 |
response = {
|
| 688 |
"status": "success",
|
warbler_cda/conflict_detector.py
CHANGED
|
@@ -65,22 +65,15 @@ class ConflictDetector:
|
|
| 65 |
- Confidence scoring and evidence collection
|
| 66 |
"""
|
| 67 |
|
| 68 |
-
def __init__(self,
|
| 69 |
-
config: Optional[Dict[str,
|
| 70 |
-
Any]] = None,
|
| 71 |
-
embedding_provider=None):
|
| 72 |
self.config = config or {}
|
| 73 |
self.embedding_provider = embedding_provider
|
| 74 |
|
| 75 |
# Configuration parameters
|
| 76 |
-
self.opposition_threshold = self.config.get(
|
| 77 |
-
|
| 78 |
-
self.
|
| 79 |
-
|
| 80 |
-
self.min_confidence_score = self.config.get(
|
| 81 |
-
"min_confidence_score", 0.6)
|
| 82 |
-
self.max_statement_age_hours = self.config.get(
|
| 83 |
-
"max_statement_age_hours", 24)
|
| 84 |
|
| 85 |
# Storage
|
| 86 |
self.statement_fingerprints: Dict[str, StatementFingerprint] = {}
|
|
@@ -152,8 +145,7 @@ class ConflictDetector:
|
|
| 152 |
"average_confidence": 0.0,
|
| 153 |
}
|
| 154 |
|
| 155 |
-
def process_statements(
|
| 156 |
-
self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 157 |
"""
|
| 158 |
Process new statements and detect conflicts with existing statements.
|
| 159 |
|
|
@@ -169,10 +161,7 @@ class ConflictDetector:
|
|
| 169 |
"new_conflicts": [],
|
| 170 |
"fingerprints_created": 0,
|
| 171 |
"total_active_statements": 0,
|
| 172 |
-
"conflict_summary": {
|
| 173 |
-
"high_confidence": 0,
|
| 174 |
-
"medium_confidence": 0,
|
| 175 |
-
"low_confidence": 0},
|
| 176 |
}
|
| 177 |
|
| 178 |
# Process each statement
|
|
@@ -184,8 +173,7 @@ class ConflictDetector:
|
|
| 184 |
continue
|
| 185 |
|
| 186 |
# Create fingerprint for new statement
|
| 187 |
-
fingerprint = self._create_statement_fingerprint(
|
| 188 |
-
statement_id, content, statement)
|
| 189 |
self.statement_fingerprints[statement_id] = fingerprint
|
| 190 |
processing_report["fingerprints_created"] += 1
|
| 191 |
|
|
@@ -203,7 +191,8 @@ class ConflictDetector:
|
|
| 203 |
"conflict_type": conflict.conflict_type.value,
|
| 204 |
"confidence_score": conflict.confidence_score,
|
| 205 |
"opposition_indicators": conflict.opposition_indicators,
|
| 206 |
-
}
|
|
|
|
| 207 |
|
| 208 |
# Categorize by confidence
|
| 209 |
if conflict.confidence_score >= 0.8:
|
|
@@ -219,8 +208,7 @@ class ConflictDetector:
|
|
| 219 |
# Update metrics
|
| 220 |
elapsed_ms = (time.time() - start_time) * 1000
|
| 221 |
self.metrics["statements_processed"] += len(statements)
|
| 222 |
-
self.metrics["conflicts_detected"] += len(
|
| 223 |
-
processing_report["new_conflicts"])
|
| 224 |
self.metrics["processing_time_ms"] += elapsed_ms
|
| 225 |
|
| 226 |
if self.detected_conflicts:
|
|
@@ -229,10 +217,8 @@ class ConflictDetector:
|
|
| 229 |
) / len(self.detected_conflicts)
|
| 230 |
|
| 231 |
processing_report["elapsed_ms"] = elapsed_ms
|
| 232 |
-
processing_report["total_active_statements"] = len(
|
| 233 |
-
|
| 234 |
-
processing_report["total_conflicts_detected"] = len(
|
| 235 |
-
self.detected_conflicts)
|
| 236 |
|
| 237 |
return processing_report
|
| 238 |
|
|
@@ -263,8 +249,7 @@ class ConflictDetector:
|
|
| 263 |
|
| 264 |
for conflict in conflicts_involving_statement:
|
| 265 |
conflict_type = conflict.conflict_type.value
|
| 266 |
-
conflict_types[conflict_type] = conflict_types.get(
|
| 267 |
-
conflict_type, 0) + 1
|
| 268 |
max_confidence = max(max_confidence, conflict.confidence_score)
|
| 269 |
|
| 270 |
# Add opposing statement
|
|
@@ -324,8 +309,7 @@ class ConflictDetector:
|
|
| 324 |
for conflict in self.detected_conflicts:
|
| 325 |
# Count by type
|
| 326 |
conflict_type = conflict.conflict_type.value
|
| 327 |
-
conflict_types[conflict_type] = conflict_types.get(
|
| 328 |
-
conflict_type, 0) + 1
|
| 329 |
|
| 330 |
# Count by confidence
|
| 331 |
if conflict.confidence_score >= 0.8:
|
|
@@ -351,8 +335,7 @@ class ConflictDetector:
|
|
| 351 |
status = "healthy"
|
| 352 |
|
| 353 |
health_score = self._calculate_health_score()
|
| 354 |
-
recommendations = self._generate_system_recommendations(
|
| 355 |
-
status, conflict_types)
|
| 356 |
|
| 357 |
return {
|
| 358 |
"total_conflicts": len(self.detected_conflicts),
|
|
@@ -402,16 +385,19 @@ class ConflictDetector:
|
|
| 402 |
# Detect negation indicators
|
| 403 |
content_lower = content.lower()
|
| 404 |
negation_indicators = [
|
| 405 |
-
pattern for pattern in self.negation_patterns if pattern in content_lower
|
|
|
|
| 406 |
|
| 407 |
# Calculate assertion strength
|
| 408 |
assertion_indicators = [
|
| 409 |
-
pattern for pattern in self.assertion_patterns if pattern in content_lower
|
|
|
|
| 410 |
assertion_strength = min(len(assertion_indicators) * 0.2, 1.0)
|
| 411 |
|
| 412 |
# Extract temporal markers
|
| 413 |
temporal_markers = [
|
| 414 |
-
pattern for pattern in self.temporal_patterns if pattern in content_lower
|
|
|
|
| 415 |
|
| 416 |
# Extract domain tags (simple keyword-based)
|
| 417 |
domain_tags = set()
|
|
@@ -467,8 +453,7 @@ class ConflictDetector:
|
|
| 467 |
context_overlap = len(
|
| 468 |
new_fingerprint.domain_tags & existing_fingerprint.domain_tags
|
| 469 |
) / max(
|
| 470 |
-
len(new_fingerprint.domain_tags |
|
| 471 |
-
existing_fingerprint.domain_tags), 1
|
| 472 |
)
|
| 473 |
|
| 474 |
# Collect opposition evidence
|
|
@@ -477,14 +462,12 @@ class ConflictDetector:
|
|
| 477 |
new_fingerprint.negation_indicators
|
| 478 |
and not existing_fingerprint.negation_indicators
|
| 479 |
):
|
| 480 |
-
opposition_indicators.extend(
|
| 481 |
-
new_fingerprint.negation_indicators)
|
| 482 |
elif (
|
| 483 |
existing_fingerprint.negation_indicators
|
| 484 |
and not new_fingerprint.negation_indicators
|
| 485 |
):
|
| 486 |
-
opposition_indicators.extend(
|
| 487 |
-
existing_fingerprint.negation_indicators)
|
| 488 |
|
| 489 |
# Determine conflict type
|
| 490 |
conflict_type = self._determine_conflict_type(
|
|
@@ -493,7 +476,8 @@ class ConflictDetector:
|
|
| 493 |
|
| 494 |
# Calculate confidence score
|
| 495 |
confidence = self._calculate_confidence_score(
|
| 496 |
-
similarity, opposition_score, context_overlap, opposition_indicators
|
|
|
|
| 497 |
|
| 498 |
if confidence >= self.min_confidence_score:
|
| 499 |
conflict = ConflictEvidence(
|
|
@@ -530,13 +514,9 @@ class ConflictDetector:
|
|
| 530 |
# Temporal conflicts
|
| 531 |
if fp1.temporal_markers and fp2.temporal_markers:
|
| 532 |
# Simple temporal conflict detection
|
| 533 |
-
if any(
|
| 534 |
-
marker in [
|
| 535 |
-
|
| 536 |
-
"earlier"] for marker in fp1.temporal_markers) and any(
|
| 537 |
-
marker in [
|
| 538 |
-
"after",
|
| 539 |
-
"later"] for marker in fp2.temporal_markers):
|
| 540 |
score += 0.3
|
| 541 |
|
| 542 |
return min(score, 1.0)
|
|
@@ -570,8 +550,7 @@ class ConflictDetector:
|
|
| 570 |
indicators: List[str],
|
| 571 |
) -> float:
|
| 572 |
"""Calculate confidence score for a conflict detection."""
|
| 573 |
-
base_score = (similarity * 0.4) +
|
| 574 |
-
(opposition_score * 0.4) + (context_overlap * 0.2)
|
| 575 |
|
| 576 |
# Boost confidence if we have clear opposition indicators
|
| 577 |
indicator_boost = min(len(indicators) * 0.1, 0.2)
|
|
@@ -601,11 +580,10 @@ class ConflictDetector:
|
|
| 601 |
|
| 602 |
def _generate_conflict_id(self, conflict: ConflictEvidence) -> str:
|
| 603 |
"""Generate unique ID for a conflict."""
|
| 604 |
-
content =
|
| 605 |
-
f"{
|
| 606 |
conflict.statement_a_id}_{
|
| 607 |
conflict.statement_b_id}_{
|
| 608 |
-
conflict.conflict_type.value}"
|
| 609 |
return hashlib.md5(content.encode()).hexdigest()[:12]
|
| 610 |
|
| 611 |
def _generate_conflict_recommendation(
|
|
@@ -629,12 +607,10 @@ class ConflictDetector:
|
|
| 629 |
recommendations = []
|
| 630 |
|
| 631 |
if status == "critical":
|
| 632 |
-
recommendations.append(
|
| 633 |
-
"Immediate review required - multiple high-confidence conflicts")
|
| 634 |
recommendations.append("Consider statement validation workflow")
|
| 635 |
elif status == "warning":
|
| 636 |
-
recommendations.append(
|
| 637 |
-
"Monitor conflicts closely - elevated conflict level")
|
| 638 |
recommendations.append("Review recent statements for accuracy")
|
| 639 |
|
| 640 |
# Type-specific recommendations
|
|
@@ -644,12 +620,10 @@ class ConflictDetector:
|
|
| 644 |
)
|
| 645 |
|
| 646 |
if conflict_types.get("temporal_conflict", 0) > 2:
|
| 647 |
-
recommendations.append(
|
| 648 |
-
"Temporal conflicts detected - verify timeline consistency")
|
| 649 |
|
| 650 |
if not recommendations:
|
| 651 |
-
recommendations.append(
|
| 652 |
-
"System operating normally - continue monitoring")
|
| 653 |
|
| 654 |
return recommendations
|
| 655 |
|
|
@@ -659,7 +633,8 @@ class ConflictDetector:
|
|
| 659 |
return 1.0
|
| 660 |
|
| 661 |
high_confidence_conflicts = sum(
|
| 662 |
-
1 for conflict in self.detected_conflicts if conflict.confidence_score > 0.8
|
|
|
|
| 663 |
|
| 664 |
total_statements = len(self.statement_fingerprints)
|
| 665 |
if total_statements == 0:
|
|
|
|
| 65 |
- Confidence scoring and evidence collection
|
| 66 |
"""
|
| 67 |
|
| 68 |
+
def __init__(self, config: Optional[Dict[str, Any]] = None, embedding_provider=None):
|
|
|
|
|
|
|
|
|
|
| 69 |
self.config = config or {}
|
| 70 |
self.embedding_provider = embedding_provider
|
| 71 |
|
| 72 |
# Configuration parameters
|
| 73 |
+
self.opposition_threshold = self.config.get("opposition_threshold", 0.7)
|
| 74 |
+
self.semantic_similarity_threshold = self.config.get("semantic_similarity_threshold", 0.8)
|
| 75 |
+
self.min_confidence_score = self.config.get("min_confidence_score", 0.6)
|
| 76 |
+
self.max_statement_age_hours = self.config.get("max_statement_age_hours", 24)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Storage
|
| 79 |
self.statement_fingerprints: Dict[str, StatementFingerprint] = {}
|
|
|
|
| 145 |
"average_confidence": 0.0,
|
| 146 |
}
|
| 147 |
|
| 148 |
+
def process_statements(self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
|
|
| 149 |
"""
|
| 150 |
Process new statements and detect conflicts with existing statements.
|
| 151 |
|
|
|
|
| 161 |
"new_conflicts": [],
|
| 162 |
"fingerprints_created": 0,
|
| 163 |
"total_active_statements": 0,
|
| 164 |
+
"conflict_summary": {"high_confidence": 0, "medium_confidence": 0, "low_confidence": 0},
|
|
|
|
|
|
|
|
|
|
| 165 |
}
|
| 166 |
|
| 167 |
# Process each statement
|
|
|
|
| 173 |
continue
|
| 174 |
|
| 175 |
# Create fingerprint for new statement
|
| 176 |
+
fingerprint = self._create_statement_fingerprint(statement_id, content, statement)
|
|
|
|
| 177 |
self.statement_fingerprints[statement_id] = fingerprint
|
| 178 |
processing_report["fingerprints_created"] += 1
|
| 179 |
|
|
|
|
| 191 |
"conflict_type": conflict.conflict_type.value,
|
| 192 |
"confidence_score": conflict.confidence_score,
|
| 193 |
"opposition_indicators": conflict.opposition_indicators,
|
| 194 |
+
}
|
| 195 |
+
)
|
| 196 |
|
| 197 |
# Categorize by confidence
|
| 198 |
if conflict.confidence_score >= 0.8:
|
|
|
|
| 208 |
# Update metrics
|
| 209 |
elapsed_ms = (time.time() - start_time) * 1000
|
| 210 |
self.metrics["statements_processed"] += len(statements)
|
| 211 |
+
self.metrics["conflicts_detected"] += len(processing_report["new_conflicts"])
|
|
|
|
| 212 |
self.metrics["processing_time_ms"] += elapsed_ms
|
| 213 |
|
| 214 |
if self.detected_conflicts:
|
|
|
|
| 217 |
) / len(self.detected_conflicts)
|
| 218 |
|
| 219 |
processing_report["elapsed_ms"] = elapsed_ms
|
| 220 |
+
processing_report["total_active_statements"] = len(self.statement_fingerprints)
|
| 221 |
+
processing_report["total_conflicts_detected"] = len(self.detected_conflicts)
|
|
|
|
|
|
|
| 222 |
|
| 223 |
return processing_report
|
| 224 |
|
|
|
|
| 249 |
|
| 250 |
for conflict in conflicts_involving_statement:
|
| 251 |
conflict_type = conflict.conflict_type.value
|
| 252 |
+
conflict_types[conflict_type] = conflict_types.get(conflict_type, 0) + 1
|
|
|
|
| 253 |
max_confidence = max(max_confidence, conflict.confidence_score)
|
| 254 |
|
| 255 |
# Add opposing statement
|
|
|
|
| 309 |
for conflict in self.detected_conflicts:
|
| 310 |
# Count by type
|
| 311 |
conflict_type = conflict.conflict_type.value
|
| 312 |
+
conflict_types[conflict_type] = conflict_types.get(conflict_type, 0) + 1
|
|
|
|
| 313 |
|
| 314 |
# Count by confidence
|
| 315 |
if conflict.confidence_score >= 0.8:
|
|
|
|
| 335 |
status = "healthy"
|
| 336 |
|
| 337 |
health_score = self._calculate_health_score()
|
| 338 |
+
recommendations = self._generate_system_recommendations(status, conflict_types)
|
|
|
|
| 339 |
|
| 340 |
return {
|
| 341 |
"total_conflicts": len(self.detected_conflicts),
|
|
|
|
| 385 |
# Detect negation indicators
|
| 386 |
content_lower = content.lower()
|
| 387 |
negation_indicators = [
|
| 388 |
+
pattern for pattern in self.negation_patterns if pattern in content_lower
|
| 389 |
+
]
|
| 390 |
|
| 391 |
# Calculate assertion strength
|
| 392 |
assertion_indicators = [
|
| 393 |
+
pattern for pattern in self.assertion_patterns if pattern in content_lower
|
| 394 |
+
]
|
| 395 |
assertion_strength = min(len(assertion_indicators) * 0.2, 1.0)
|
| 396 |
|
| 397 |
# Extract temporal markers
|
| 398 |
temporal_markers = [
|
| 399 |
+
pattern for pattern in self.temporal_patterns if pattern in content_lower
|
| 400 |
+
]
|
| 401 |
|
| 402 |
# Extract domain tags (simple keyword-based)
|
| 403 |
domain_tags = set()
|
|
|
|
| 453 |
context_overlap = len(
|
| 454 |
new_fingerprint.domain_tags & existing_fingerprint.domain_tags
|
| 455 |
) / max(
|
| 456 |
+
len(new_fingerprint.domain_tags | existing_fingerprint.domain_tags), 1
|
|
|
|
| 457 |
)
|
| 458 |
|
| 459 |
# Collect opposition evidence
|
|
|
|
| 462 |
new_fingerprint.negation_indicators
|
| 463 |
and not existing_fingerprint.negation_indicators
|
| 464 |
):
|
| 465 |
+
opposition_indicators.extend(new_fingerprint.negation_indicators)
|
|
|
|
| 466 |
elif (
|
| 467 |
existing_fingerprint.negation_indicators
|
| 468 |
and not new_fingerprint.negation_indicators
|
| 469 |
):
|
| 470 |
+
opposition_indicators.extend(existing_fingerprint.negation_indicators)
|
|
|
|
| 471 |
|
| 472 |
# Determine conflict type
|
| 473 |
conflict_type = self._determine_conflict_type(
|
|
|
|
| 476 |
|
| 477 |
# Calculate confidence score
|
| 478 |
confidence = self._calculate_confidence_score(
|
| 479 |
+
similarity, opposition_score, context_overlap, opposition_indicators
|
| 480 |
+
)
|
| 481 |
|
| 482 |
if confidence >= self.min_confidence_score:
|
| 483 |
conflict = ConflictEvidence(
|
|
|
|
| 514 |
# Temporal conflicts
|
| 515 |
if fp1.temporal_markers and fp2.temporal_markers:
|
| 516 |
# Simple temporal conflict detection
|
| 517 |
+
if any(marker in ["before", "earlier"] for marker in fp1.temporal_markers) and any(
|
| 518 |
+
marker in ["after", "later"] for marker in fp2.temporal_markers
|
| 519 |
+
):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
score += 0.3
|
| 521 |
|
| 522 |
return min(score, 1.0)
|
|
|
|
| 550 |
indicators: List[str],
|
| 551 |
) -> float:
|
| 552 |
"""Calculate confidence score for a conflict detection."""
|
| 553 |
+
base_score = (similarity * 0.4) + (opposition_score * 0.4) + (context_overlap * 0.2)
|
|
|
|
| 554 |
|
| 555 |
# Boost confidence if we have clear opposition indicators
|
| 556 |
indicator_boost = min(len(indicators) * 0.1, 0.2)
|
|
|
|
| 580 |
|
| 581 |
def _generate_conflict_id(self, conflict: ConflictEvidence) -> str:
|
| 582 |
"""Generate unique ID for a conflict."""
|
| 583 |
+
content = f"{
|
|
|
|
| 584 |
conflict.statement_a_id}_{
|
| 585 |
conflict.statement_b_id}_{
|
| 586 |
+
conflict.conflict_type.value}"
|
| 587 |
return hashlib.md5(content.encode()).hexdigest()[:12]
|
| 588 |
|
| 589 |
def _generate_conflict_recommendation(
|
|
|
|
| 607 |
recommendations = []
|
| 608 |
|
| 609 |
if status == "critical":
|
| 610 |
+
recommendations.append("Immediate review required - multiple high-confidence conflicts")
|
|
|
|
| 611 |
recommendations.append("Consider statement validation workflow")
|
| 612 |
elif status == "warning":
|
| 613 |
+
recommendations.append("Monitor conflicts closely - elevated conflict level")
|
|
|
|
| 614 |
recommendations.append("Review recent statements for accuracy")
|
| 615 |
|
| 616 |
# Type-specific recommendations
|
|
|
|
| 620 |
)
|
| 621 |
|
| 622 |
if conflict_types.get("temporal_conflict", 0) > 2:
|
| 623 |
+
recommendations.append("Temporal conflicts detected - verify timeline consistency")
|
|
|
|
| 624 |
|
| 625 |
if not recommendations:
|
| 626 |
+
recommendations.append("System operating normally - continue monitoring")
|
|
|
|
| 627 |
|
| 628 |
return recommendations
|
| 629 |
|
|
|
|
| 633 |
return 1.0
|
| 634 |
|
| 635 |
high_confidence_conflicts = sum(
|
| 636 |
+
1 for conflict in self.detected_conflicts if conflict.confidence_score > 0.8
|
| 637 |
+
)
|
| 638 |
|
| 639 |
total_statements = len(self.statement_fingerprints)
|
| 640 |
if total_statements == 0:
|
warbler_cda/evaporation.py
CHANGED
|
@@ -9,8 +9,7 @@ from collections import Counter
|
|
| 9 |
class EvaporationEngine:
|
| 10 |
"""Evaporation: converts molten glyphs into mist lines (proto-thoughts) with advanced style bias."""
|
| 11 |
|
| 12 |
-
def __init__(self, magma_store, cloud_store,
|
| 13 |
-
config: Optional[Dict[str, Any]] = None):
|
| 14 |
self.magma_store = magma_store
|
| 15 |
self.cloud_store = cloud_store
|
| 16 |
self.config = config or {}
|
|
@@ -23,15 +22,12 @@ class EvaporationEngine:
|
|
| 23 |
# Language generation parameters
|
| 24 |
self.creativity_level = self.config.get("creativity_level", 0.7)
|
| 25 |
self.compression_ratio = self.config.get("compression_ratio", 0.6)
|
| 26 |
-
self.mythic_amplification = self.config.get(
|
| 27 |
-
"mythic_amplification", 1.2)
|
| 28 |
|
| 29 |
# Advanced distillation parameters
|
| 30 |
-
self.semantic_density_threshold = self.config.get(
|
| 31 |
-
"semantic_density_threshold", 0.5)
|
| 32 |
self.affect_sensitivity = self.config.get("affect_sensitivity", 0.8)
|
| 33 |
-
self.temporal_decay_factor = self.config.get(
|
| 34 |
-
"temporal_decay_factor", 0.1)
|
| 35 |
|
| 36 |
def evaporate(
|
| 37 |
self, limit: int = 5, style_override: Optional[str] = None
|
|
@@ -43,8 +39,7 @@ class EvaporationEngine:
|
|
| 43 |
mist_lines = []
|
| 44 |
for i, glyph in enumerate(molten[:limit]):
|
| 45 |
# Apply style variation for diversity
|
| 46 |
-
current_style = style_override or self._determine_style_for_glyph(
|
| 47 |
-
glyph, i)
|
| 48 |
|
| 49 |
# Advanced mist distillation
|
| 50 |
mist = self._advanced_distill_mist(glyph, current_style)
|
|
@@ -62,11 +57,9 @@ class EvaporationEngine:
|
|
| 62 |
|
| 63 |
return mist_lines
|
| 64 |
|
| 65 |
-
def _select_optimal_glyphs(
|
| 66 |
-
self, target_count: int) -> List[Dict[str, Any]]:
|
| 67 |
"""Select optimal glyphs for evaporation based on multiple criteria."""
|
| 68 |
-
molten = self.magma_store.select_hot(
|
| 69 |
-
target_count * 3) # Get more candidates
|
| 70 |
|
| 71 |
# Score glyphs based on multiple factors
|
| 72 |
scored_glyphs = []
|
|
@@ -80,8 +73,7 @@ class EvaporationEngine:
|
|
| 80 |
|
| 81 |
# Affect diversity factor
|
| 82 |
affect = glyph.get("affect", {})
|
| 83 |
-
affect_score = sum(abs(v)
|
| 84 |
-
for v in affect.values()) / max(len(affect), 1)
|
| 85 |
score += affect_score * 0.3
|
| 86 |
|
| 87 |
# Temporal freshness factor
|
|
@@ -100,8 +92,7 @@ class EvaporationEngine:
|
|
| 100 |
scored_glyphs.sort(key=lambda x: x[1], reverse=True)
|
| 101 |
return [glyph for glyph, _ in scored_glyphs[:target_count]]
|
| 102 |
|
| 103 |
-
def _determine_style_for_glyph(
|
| 104 |
-
self, glyph: Dict[str, Any], index: int) -> str:
|
| 105 |
"""Determine optimal style for a specific glyph."""
|
| 106 |
affect = glyph.get("affect", {})
|
| 107 |
heat = glyph.get("heat", 0.0)
|
|
@@ -128,27 +119,22 @@ class EvaporationEngine:
|
|
| 128 |
|
| 129 |
return base_style
|
| 130 |
|
| 131 |
-
def _advanced_distill_mist(
|
| 132 |
-
self, glyph: Dict[str, Any], style: str) -> Dict[str, Any]:
|
| 133 |
"""Advanced mist distillation with style bias."""
|
| 134 |
summary = glyph.get("compressed_summary", "")
|
| 135 |
affect = glyph.get("affect", {})
|
| 136 |
heat = glyph.get("heat", 0.0)
|
| 137 |
|
| 138 |
# Get style profile
|
| 139 |
-
style_profile = self.style_profiles.get(
|
| 140 |
-
style, self.style_profiles["balanced"])
|
| 141 |
|
| 142 |
# Generate proto-thought with style bias
|
| 143 |
-
proto_thought = self._generate_styled_proto_thought(
|
| 144 |
-
summary, affect, style_profile)
|
| 145 |
|
| 146 |
# Calculate advanced metrics
|
| 147 |
-
evaporation_temp = self._calculate_evaporation_temperature(
|
| 148 |
-
heat, affect)
|
| 149 |
technical_clarity = self._calculate_technical_clarity(summary, style)
|
| 150 |
-
mythic_weight = self._calculate_mythic_weight(
|
| 151 |
-
affect, style) * self.mythic_amplification
|
| 152 |
|
| 153 |
# Create enhanced mist line
|
| 154 |
mist_line = {
|
|
@@ -212,15 +198,13 @@ class EvaporationEngine:
|
|
| 212 |
|
| 213 |
return concepts[:5] # Limit to top 5 concepts
|
| 214 |
|
| 215 |
-
def _apply_poetic_style(
|
| 216 |
-
self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
| 217 |
"""Apply poetic style to proto-thought."""
|
| 218 |
if not concepts:
|
| 219 |
return "[Poetic] Ethereal mist of untold stories..."
|
| 220 |
|
| 221 |
# Poetic connectors and imagery
|
| 222 |
-
poetic_connectors = ["whispers", "dreams",
|
| 223 |
-
"echoes", "shadows", "light", "flow"]
|
| 224 |
poetic_imagery = [
|
| 225 |
"through ancient corridors",
|
| 226 |
"across starlit paths",
|
|
@@ -238,8 +222,7 @@ class EvaporationEngine:
|
|
| 238 |
concepts[0]} and {
|
| 239 |
concepts[1]} {connector} {imagery}."
|
| 240 |
|
| 241 |
-
def _apply_technical_style(
|
| 242 |
-
self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
| 243 |
"""Apply technical style to proto-thought."""
|
| 244 |
if not concepts:
|
| 245 |
return "[Technical] System processing: null input detected."
|
|
@@ -251,8 +234,7 @@ class EvaporationEngine:
|
|
| 251 |
"Architecture review:",
|
| 252 |
"Implementation note:",
|
| 253 |
]
|
| 254 |
-
tech_connectors = ["enables", "facilitates",
|
| 255 |
-
"optimizes", "integrates", "synchronizes"]
|
| 256 |
|
| 257 |
prefix = random.choice(tech_prefixes)
|
| 258 |
connector = random.choice(tech_connectors)
|
|
@@ -264,8 +246,7 @@ class EvaporationEngine:
|
|
| 264 |
concepts[0]} {connector} {
|
| 265 |
concepts[1]} subsystem."
|
| 266 |
|
| 267 |
-
def _apply_narrative_style(
|
| 268 |
-
self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
| 269 |
"""Apply narrative style to proto-thought."""
|
| 270 |
if not concepts:
|
| 271 |
return "[Narrative] Once upon a time, in the realm of forgotten ideas..."
|
|
@@ -277,8 +258,7 @@ class EvaporationEngine:
|
|
| 277 |
"Beyond the horizon,",
|
| 278 |
"Within the tapestry of",
|
| 279 |
]
|
| 280 |
-
narrative_actions = ["emerges", "dances",
|
| 281 |
-
"whispers", "journeys", "transforms"]
|
| 282 |
|
| 283 |
opener = random.choice(narrative_openers)
|
| 284 |
action = random.choice(narrative_actions)
|
|
@@ -290,8 +270,7 @@ class EvaporationEngine:
|
|
| 290 |
concepts[0]} and {
|
| 291 |
concepts[1]} {action} together."
|
| 292 |
|
| 293 |
-
def _apply_mythic_style(
|
| 294 |
-
self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
| 295 |
"""Apply mythic style to proto-thought."""
|
| 296 |
if not concepts:
|
| 297 |
return "[Mythic] From the primordial void, legends are born..."
|
|
@@ -320,8 +299,7 @@ class EvaporationEngine:
|
|
| 320 |
concepts[0]} and {
|
| 321 |
concepts[1]} within {entity}."
|
| 322 |
|
| 323 |
-
def _apply_balanced_style(
|
| 324 |
-
self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
| 325 |
"""Apply balanced style to proto-thought."""
|
| 326 |
if not concepts:
|
| 327 |
return "[Balanced] Contemplation on the nature of existence..."
|
|
@@ -330,13 +308,11 @@ class EvaporationEngine:
|
|
| 330 |
return f"[Balanced] Reflection on {
|
| 331 |
concepts[0]} reveals deeper meaning."
|
| 332 |
else:
|
| 333 |
-
return
|
| 334 |
-
f"[Balanced] The interplay between {
|
| 335 |
concepts[0]} and {
|
| 336 |
-
concepts[1]} creates harmony."
|
| 337 |
|
| 338 |
-
def _apply_affect_coloring(
|
| 339 |
-
self, proto_thought: str, affect: Dict[str, Any]) -> str:
|
| 340 |
"""Apply affect-based coloring to proto-thought."""
|
| 341 |
if not affect:
|
| 342 |
return proto_thought
|
|
@@ -367,15 +343,13 @@ class EvaporationEngine:
|
|
| 367 |
if len(words) > 15:
|
| 368 |
# Keep first, middle, and last parts
|
| 369 |
keep_first = words[:5]
|
| 370 |
-
keep_middle = words[len(words) // 2 - 2: len(words) // 2 + 2]
|
| 371 |
keep_last = words[-3:]
|
| 372 |
-
compressed = keep_first + ["..."] +
|
| 373 |
-
keep_middle + ["..."] + keep_last
|
| 374 |
return " ".join(compressed)
|
| 375 |
return proto_thought
|
| 376 |
|
| 377 |
-
def _calculate_advanced_humidity(
|
| 378 |
-
self, mist_lines: List[Dict[str, Any]]) -> float:
|
| 379 |
"""Calculate sophisticated humidity based on multiple factors."""
|
| 380 |
if not mist_lines:
|
| 381 |
return 0.0
|
|
@@ -390,10 +364,8 @@ class EvaporationEngine:
|
|
| 390 |
mythic_contribution = avg_mythic * 0.3
|
| 391 |
|
| 392 |
# Technical clarity contribution (inverse relationship)
|
| 393 |
-
avg_clarity = sum(m.get("technical_clarity", 0.5)
|
| 394 |
-
|
| 395 |
-
clarity_contribution = (1.0 - avg_clarity) * \
|
| 396 |
-
0.1 # Less clear = more humid
|
| 397 |
|
| 398 |
# Style diversity contribution
|
| 399 |
styles = [m.get("style", "balanced") for m in mist_lines]
|
|
@@ -426,10 +398,8 @@ class EvaporationEngine:
|
|
| 426 |
return
|
| 427 |
|
| 428 |
# Analyze mist characteristics
|
| 429 |
-
avg_mythic = sum(m.get("mythic_weight", 0.0)
|
| 430 |
-
|
| 431 |
-
avg_clarity = sum(m.get("technical_clarity", 0.5)
|
| 432 |
-
for m in mist_lines) / len(mist_lines)
|
| 433 |
styles = [m.get("style", "balanced") for m in mist_lines]
|
| 434 |
style_diversity = len(set(styles)) / len(styles) if styles else 0
|
| 435 |
|
|
@@ -471,14 +441,12 @@ class EvaporationEngine:
|
|
| 471 |
|
| 472 |
return min(1.0, density * length_factor)
|
| 473 |
|
| 474 |
-
def _calculate_evaporation_temperature(
|
| 475 |
-
self, heat: float, affect: Dict[str, Any]) -> float:
|
| 476 |
"""Calculate evaporation temperature based on heat and affect."""
|
| 477 |
base_temp = heat * 0.8 # Primary factor from heat
|
| 478 |
|
| 479 |
# Affect modulation
|
| 480 |
-
affect_intensity = sum(abs(v)
|
| 481 |
-
for v in affect.values()) / max(len(affect), 1)
|
| 482 |
affect_modulation = affect_intensity * 0.2
|
| 483 |
|
| 484 |
# Combine and bound
|
|
@@ -508,17 +476,14 @@ class EvaporationEngine:
|
|
| 508 |
if summary:
|
| 509 |
# Normalize by expected length
|
| 510 |
complexity = len(summary.split()) / 20.0
|
| 511 |
-
complexity_adjustment = max(-0.2,
|
| 512 |
-
min(0.2, (0.5 - complexity) * 0.4))
|
| 513 |
clarity += complexity_adjustment
|
| 514 |
|
| 515 |
return min(1.0, max(0.1, clarity))
|
| 516 |
|
| 517 |
-
def _calculate_mythic_weight(
|
| 518 |
-
self, affect: Dict[str, Any], style: str) -> float:
|
| 519 |
"""Calculate mythic weight based on affect and style."""
|
| 520 |
-
base_weight = affect.get("awe", 0.0) * 0.6 +
|
| 521 |
-
affect.get("wonder", 0.0) * 0.4
|
| 522 |
|
| 523 |
# Style multipliers
|
| 524 |
style_multipliers = {
|
|
@@ -536,8 +501,7 @@ class EvaporationEngine:
|
|
| 536 |
multiplier = style_multipliers.get(style, 1.0)
|
| 537 |
return min(1.0, base_weight * multiplier)
|
| 538 |
|
| 539 |
-
def _create_affect_signature(
|
| 540 |
-
self, affect: Dict[str, Any]) -> Dict[str, float]:
|
| 541 |
"""Create normalized affect signature."""
|
| 542 |
if not affect:
|
| 543 |
return {}
|
|
@@ -574,8 +538,7 @@ class EvaporationEngine:
|
|
| 574 |
proto_concepts = set(self._extract_key_concepts(proto_thought))
|
| 575 |
|
| 576 |
if original_concepts:
|
| 577 |
-
preservation = len(original_concepts &
|
| 578 |
-
proto_concepts) / len(original_concepts)
|
| 579 |
quality_score += preservation * 0.2
|
| 580 |
|
| 581 |
# Affect alignment
|
|
|
|
| 9 |
class EvaporationEngine:
|
| 10 |
"""Evaporation: converts molten glyphs into mist lines (proto-thoughts) with advanced style bias."""
|
| 11 |
|
| 12 |
+
def __init__(self, magma_store, cloud_store, config: Optional[Dict[str, Any]] = None):
|
|
|
|
| 13 |
self.magma_store = magma_store
|
| 14 |
self.cloud_store = cloud_store
|
| 15 |
self.config = config or {}
|
|
|
|
| 22 |
# Language generation parameters
|
| 23 |
self.creativity_level = self.config.get("creativity_level", 0.7)
|
| 24 |
self.compression_ratio = self.config.get("compression_ratio", 0.6)
|
| 25 |
+
self.mythic_amplification = self.config.get("mythic_amplification", 1.2)
|
|
|
|
| 26 |
|
| 27 |
# Advanced distillation parameters
|
| 28 |
+
self.semantic_density_threshold = self.config.get("semantic_density_threshold", 0.5)
|
|
|
|
| 29 |
self.affect_sensitivity = self.config.get("affect_sensitivity", 0.8)
|
| 30 |
+
self.temporal_decay_factor = self.config.get("temporal_decay_factor", 0.1)
|
|
|
|
| 31 |
|
| 32 |
def evaporate(
|
| 33 |
self, limit: int = 5, style_override: Optional[str] = None
|
|
|
|
| 39 |
mist_lines = []
|
| 40 |
for i, glyph in enumerate(molten[:limit]):
|
| 41 |
# Apply style variation for diversity
|
| 42 |
+
current_style = style_override or self._determine_style_for_glyph(glyph, i)
|
|
|
|
| 43 |
|
| 44 |
# Advanced mist distillation
|
| 45 |
mist = self._advanced_distill_mist(glyph, current_style)
|
|
|
|
| 57 |
|
| 58 |
return mist_lines
|
| 59 |
|
| 60 |
+
def _select_optimal_glyphs(self, target_count: int) -> List[Dict[str, Any]]:
|
|
|
|
| 61 |
"""Select optimal glyphs for evaporation based on multiple criteria."""
|
| 62 |
+
molten = self.magma_store.select_hot(target_count * 3) # Get more candidates
|
|
|
|
| 63 |
|
| 64 |
# Score glyphs based on multiple factors
|
| 65 |
scored_glyphs = []
|
|
|
|
| 73 |
|
| 74 |
# Affect diversity factor
|
| 75 |
affect = glyph.get("affect", {})
|
| 76 |
+
affect_score = sum(abs(v) for v in affect.values()) / max(len(affect), 1)
|
|
|
|
| 77 |
score += affect_score * 0.3
|
| 78 |
|
| 79 |
# Temporal freshness factor
|
|
|
|
| 92 |
scored_glyphs.sort(key=lambda x: x[1], reverse=True)
|
| 93 |
return [glyph for glyph, _ in scored_glyphs[:target_count]]
|
| 94 |
|
| 95 |
+
def _determine_style_for_glyph(self, glyph: Dict[str, Any], index: int) -> str:
|
|
|
|
| 96 |
"""Determine optimal style for a specific glyph."""
|
| 97 |
affect = glyph.get("affect", {})
|
| 98 |
heat = glyph.get("heat", 0.0)
|
|
|
|
| 119 |
|
| 120 |
return base_style
|
| 121 |
|
| 122 |
+
def _advanced_distill_mist(self, glyph: Dict[str, Any], style: str) -> Dict[str, Any]:
|
|
|
|
| 123 |
"""Advanced mist distillation with style bias."""
|
| 124 |
summary = glyph.get("compressed_summary", "")
|
| 125 |
affect = glyph.get("affect", {})
|
| 126 |
heat = glyph.get("heat", 0.0)
|
| 127 |
|
| 128 |
# Get style profile
|
| 129 |
+
style_profile = self.style_profiles.get(style, self.style_profiles["balanced"])
|
|
|
|
| 130 |
|
| 131 |
# Generate proto-thought with style bias
|
| 132 |
+
proto_thought = self._generate_styled_proto_thought(summary, affect, style_profile)
|
|
|
|
| 133 |
|
| 134 |
# Calculate advanced metrics
|
| 135 |
+
evaporation_temp = self._calculate_evaporation_temperature(heat, affect)
|
|
|
|
| 136 |
technical_clarity = self._calculate_technical_clarity(summary, style)
|
| 137 |
+
mythic_weight = self._calculate_mythic_weight(affect, style) * self.mythic_amplification
|
|
|
|
| 138 |
|
| 139 |
# Create enhanced mist line
|
| 140 |
mist_line = {
|
|
|
|
| 198 |
|
| 199 |
return concepts[:5] # Limit to top 5 concepts
|
| 200 |
|
| 201 |
+
def _apply_poetic_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
|
|
|
| 202 |
"""Apply poetic style to proto-thought."""
|
| 203 |
if not concepts:
|
| 204 |
return "[Poetic] Ethereal mist of untold stories..."
|
| 205 |
|
| 206 |
# Poetic connectors and imagery
|
| 207 |
+
poetic_connectors = ["whispers", "dreams", "echoes", "shadows", "light", "flow"]
|
|
|
|
| 208 |
poetic_imagery = [
|
| 209 |
"through ancient corridors",
|
| 210 |
"across starlit paths",
|
|
|
|
| 222 |
concepts[0]} and {
|
| 223 |
concepts[1]} {connector} {imagery}."
|
| 224 |
|
| 225 |
+
def _apply_technical_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
|
|
|
| 226 |
"""Apply technical style to proto-thought."""
|
| 227 |
if not concepts:
|
| 228 |
return "[Technical] System processing: null input detected."
|
|
|
|
| 234 |
"Architecture review:",
|
| 235 |
"Implementation note:",
|
| 236 |
]
|
| 237 |
+
tech_connectors = ["enables", "facilitates", "optimizes", "integrates", "synchronizes"]
|
|
|
|
| 238 |
|
| 239 |
prefix = random.choice(tech_prefixes)
|
| 240 |
connector = random.choice(tech_connectors)
|
|
|
|
| 246 |
concepts[0]} {connector} {
|
| 247 |
concepts[1]} subsystem."
|
| 248 |
|
| 249 |
+
def _apply_narrative_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
|
|
|
| 250 |
"""Apply narrative style to proto-thought."""
|
| 251 |
if not concepts:
|
| 252 |
return "[Narrative] Once upon a time, in the realm of forgotten ideas..."
|
|
|
|
| 258 |
"Beyond the horizon,",
|
| 259 |
"Within the tapestry of",
|
| 260 |
]
|
| 261 |
+
narrative_actions = ["emerges", "dances", "whispers", "journeys", "transforms"]
|
|
|
|
| 262 |
|
| 263 |
opener = random.choice(narrative_openers)
|
| 264 |
action = random.choice(narrative_actions)
|
|
|
|
| 270 |
concepts[0]} and {
|
| 271 |
concepts[1]} {action} together."
|
| 272 |
|
| 273 |
+
def _apply_mythic_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
|
|
|
| 274 |
"""Apply mythic style to proto-thought."""
|
| 275 |
if not concepts:
|
| 276 |
return "[Mythic] From the primordial void, legends are born..."
|
|
|
|
| 299 |
concepts[0]} and {
|
| 300 |
concepts[1]} within {entity}."
|
| 301 |
|
| 302 |
+
def _apply_balanced_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
|
|
|
|
| 303 |
"""Apply balanced style to proto-thought."""
|
| 304 |
if not concepts:
|
| 305 |
return "[Balanced] Contemplation on the nature of existence..."
|
|
|
|
| 308 |
return f"[Balanced] Reflection on {
|
| 309 |
concepts[0]} reveals deeper meaning."
|
| 310 |
else:
|
| 311 |
+
return f"[Balanced] The interplay between {
|
|
|
|
| 312 |
concepts[0]} and {
|
| 313 |
+
concepts[1]} creates harmony."
|
| 314 |
|
| 315 |
+
def _apply_affect_coloring(self, proto_thought: str, affect: Dict[str, Any]) -> str:
|
|
|
|
| 316 |
"""Apply affect-based coloring to proto-thought."""
|
| 317 |
if not affect:
|
| 318 |
return proto_thought
|
|
|
|
| 343 |
if len(words) > 15:
|
| 344 |
# Keep first, middle, and last parts
|
| 345 |
keep_first = words[:5]
|
| 346 |
+
keep_middle = words[len(words) // 2 - 2 : len(words) // 2 + 2]
|
| 347 |
keep_last = words[-3:]
|
| 348 |
+
compressed = keep_first + ["..."] + keep_middle + ["..."] + keep_last
|
|
|
|
| 349 |
return " ".join(compressed)
|
| 350 |
return proto_thought
|
| 351 |
|
| 352 |
+
def _calculate_advanced_humidity(self, mist_lines: List[Dict[str, Any]]) -> float:
|
|
|
|
| 353 |
"""Calculate sophisticated humidity based on multiple factors."""
|
| 354 |
if not mist_lines:
|
| 355 |
return 0.0
|
|
|
|
| 364 |
mythic_contribution = avg_mythic * 0.3
|
| 365 |
|
| 366 |
# Technical clarity contribution (inverse relationship)
|
| 367 |
+
avg_clarity = sum(m.get("technical_clarity", 0.5) for m in mist_lines) / len(mist_lines)
|
| 368 |
+
clarity_contribution = (1.0 - avg_clarity) * 0.1 # Less clear = more humid
|
|
|
|
|
|
|
| 369 |
|
| 370 |
# Style diversity contribution
|
| 371 |
styles = [m.get("style", "balanced") for m in mist_lines]
|
|
|
|
| 398 |
return
|
| 399 |
|
| 400 |
# Analyze mist characteristics
|
| 401 |
+
avg_mythic = sum(m.get("mythic_weight", 0.0) for m in mist_lines) / len(mist_lines)
|
| 402 |
+
avg_clarity = sum(m.get("technical_clarity", 0.5) for m in mist_lines) / len(mist_lines)
|
|
|
|
|
|
|
| 403 |
styles = [m.get("style", "balanced") for m in mist_lines]
|
| 404 |
style_diversity = len(set(styles)) / len(styles) if styles else 0
|
| 405 |
|
|
|
|
| 441 |
|
| 442 |
return min(1.0, density * length_factor)
|
| 443 |
|
| 444 |
+
def _calculate_evaporation_temperature(self, heat: float, affect: Dict[str, Any]) -> float:
|
|
|
|
| 445 |
"""Calculate evaporation temperature based on heat and affect."""
|
| 446 |
base_temp = heat * 0.8 # Primary factor from heat
|
| 447 |
|
| 448 |
# Affect modulation
|
| 449 |
+
affect_intensity = sum(abs(v) for v in affect.values()) / max(len(affect), 1)
|
|
|
|
| 450 |
affect_modulation = affect_intensity * 0.2
|
| 451 |
|
| 452 |
# Combine and bound
|
|
|
|
| 476 |
if summary:
|
| 477 |
# Normalize by expected length
|
| 478 |
complexity = len(summary.split()) / 20.0
|
| 479 |
+
complexity_adjustment = max(-0.2, min(0.2, (0.5 - complexity) * 0.4))
|
|
|
|
| 480 |
clarity += complexity_adjustment
|
| 481 |
|
| 482 |
return min(1.0, max(0.1, clarity))
|
| 483 |
|
| 484 |
+
def _calculate_mythic_weight(self, affect: Dict[str, Any], style: str) -> float:
|
|
|
|
| 485 |
"""Calculate mythic weight based on affect and style."""
|
| 486 |
+
base_weight = affect.get("awe", 0.0) * 0.6 + affect.get("wonder", 0.0) * 0.4
|
|
|
|
| 487 |
|
| 488 |
# Style multipliers
|
| 489 |
style_multipliers = {
|
|
|
|
| 501 |
multiplier = style_multipliers.get(style, 1.0)
|
| 502 |
return min(1.0, base_weight * multiplier)
|
| 503 |
|
| 504 |
+
def _create_affect_signature(self, affect: Dict[str, Any]) -> Dict[str, float]:
|
|
|
|
| 505 |
"""Create normalized affect signature."""
|
| 506 |
if not affect:
|
| 507 |
return {}
|
|
|
|
| 538 |
proto_concepts = set(self._extract_key_concepts(proto_thought))
|
| 539 |
|
| 540 |
if original_concepts:
|
| 541 |
+
preservation = len(original_concepts & proto_concepts) / len(original_concepts)
|
|
|
|
| 542 |
quality_score += preservation * 0.2
|
| 543 |
|
| 544 |
# Affect alignment
|
warbler_cda/pack_loader.py
CHANGED
|
@@ -37,14 +37,12 @@ class PackLoader:
|
|
| 37 |
|
| 38 |
pack_docs = self._load_pack(pack_dir, pack_name)
|
| 39 |
documents.extend(pack_docs)
|
| 40 |
-
logger.info(
|
| 41 |
-
f"✓ Loaded {len(pack_docs)} documents from {pack_name}")
|
| 42 |
|
| 43 |
self.documents = documents
|
| 44 |
return documents
|
| 45 |
|
| 46 |
-
def _load_pack(self, pack_dir: Path,
|
| 47 |
-
pack_name: str) -> List[Dict[str, Any]]:
|
| 48 |
"""Load documents from a specific pack"""
|
| 49 |
documents = []
|
| 50 |
|
|
@@ -61,11 +59,7 @@ class PackLoader:
|
|
| 61 |
|
| 62 |
return documents
|
| 63 |
|
| 64 |
-
def _is_valid_warbler_pack(
|
| 65 |
-
self,
|
| 66 |
-
pack_dir: Path,
|
| 67 |
-
pack_name: str,
|
| 68 |
-
jsonl_file: Path) -> bool:
|
| 69 |
"""Validate that a directory is a valid Warbler pack
|
| 70 |
|
| 71 |
A valid Warbler pack must have:
|
|
@@ -105,8 +99,7 @@ class PackLoader:
|
|
| 105 |
# For chunked packs, look for chunk files
|
| 106 |
chunk_files = list(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
|
| 107 |
if chunk_files:
|
| 108 |
-
logger.debug(
|
| 109 |
-
f"Found {len(chunk_files)} chunk files for {pack_name}")
|
| 110 |
return True
|
| 111 |
else:
|
| 112 |
logger.warning(f"Chunked pack {pack_name} has no chunk files")
|
|
@@ -116,12 +109,10 @@ class PackLoader:
|
|
| 116 |
if jsonl_file.exists():
|
| 117 |
return True
|
| 118 |
else:
|
| 119 |
-
logger.warning(
|
| 120 |
-
f"Single-file pack {pack_name} missing JSONL file: {jsonl_file}")
|
| 121 |
return False
|
| 122 |
|
| 123 |
-
def _load_jsonl_pack(self, pack_dir: Path,
|
| 124 |
-
pack_name: str) -> List[Dict[str, Any]]:
|
| 125 |
"""Load JSONL-based pack (supports both single-file and chunked packs)"""
|
| 126 |
documents = []
|
| 127 |
|
|
@@ -135,11 +126,9 @@ class PackLoader:
|
|
| 135 |
with open(package_json, "r", encoding="utf-8") as f:
|
| 136 |
metadata = json.load(f)
|
| 137 |
is_chunked = metadata.get("chunked", False)
|
| 138 |
-
chunk_pattern = metadata.get(
|
| 139 |
-
"chunk_pattern", f"{pack_name}-chunk-*.jsonl")
|
| 140 |
except (json.JSONDecodeError, IOError) as e:
|
| 141 |
-
logger.warning(
|
| 142 |
-
f"Could not read package.json for {pack_name}: {e}")
|
| 143 |
|
| 144 |
if is_chunked:
|
| 145 |
# Load chunked pack
|
|
@@ -152,12 +141,10 @@ class PackLoader:
|
|
| 152 |
chunk_files = sorted(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
|
| 153 |
|
| 154 |
if not chunk_files:
|
| 155 |
-
logger.warning(
|
| 156 |
-
f"No chunk files found for chunked pack {pack_name}")
|
| 157 |
return documents
|
| 158 |
|
| 159 |
-
logger.info(
|
| 160 |
-
f"Found {len(chunk_files)} chunk files for {pack_name}")
|
| 161 |
|
| 162 |
# Load each chunk file in order
|
| 163 |
for chunk_file in chunk_files:
|
|
@@ -168,7 +155,8 @@ class PackLoader:
|
|
| 168 |
logger.info(
|
| 169 |
f"Loaded {
|
| 170 |
len(documents)} total documents from {
|
| 171 |
-
len(chunk_files)} chunks"
|
|
|
|
| 172 |
else:
|
| 173 |
# Load single-file pack (backward compatibility)
|
| 174 |
jsonl_file = pack_dir / f"{pack_name}.jsonl"
|
|
@@ -180,8 +168,7 @@ class PackLoader:
|
|
| 180 |
|
| 181 |
return documents
|
| 182 |
|
| 183 |
-
def _load_jsonl_file(self, jsonl_file: Path,
|
| 184 |
-
pack_name: str) -> List[Dict[str, Any]]:
|
| 185 |
"""Load a single JSONL file with robust error handling"""
|
| 186 |
documents = []
|
| 187 |
error_count = 0
|
|
@@ -196,8 +183,11 @@ class PackLoader:
|
|
| 196 |
try:
|
| 197 |
entry = json.loads(line)
|
| 198 |
doc = self._format_document(
|
| 199 |
-
entry,
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
| 201 |
documents.append(doc)
|
| 202 |
except json.JSONDecodeError as e:
|
| 203 |
error_count += 1
|
|
@@ -205,7 +195,8 @@ class PackLoader:
|
|
| 205 |
if error_count <= max_errors_to_log:
|
| 206 |
logger.warning(
|
| 207 |
f"Error parsing line {line_num} in {
|
| 208 |
-
jsonl_file.name}: {e}"
|
|
|
|
| 209 |
# Continue processing other lines instead of failing
|
| 210 |
continue
|
| 211 |
|
|
@@ -213,14 +204,14 @@ class PackLoader:
|
|
| 213 |
logger.info(
|
| 214 |
f"Loaded {
|
| 215 |
len(documents)} documents from {
|
| 216 |
-
jsonl_file.name} ({error_count} lines skipped due to errors)"
|
|
|
|
| 217 |
except Exception as e:
|
| 218 |
logger.error(f"Error loading JSONL file {jsonl_file}: {e}")
|
| 219 |
|
| 220 |
return documents
|
| 221 |
|
| 222 |
-
def _load_structured_pack(self, pack_dir: Path,
|
| 223 |
-
pack_name: str) -> List[Dict[str, Any]]:
|
| 224 |
"""Load structured pack with templates"""
|
| 225 |
documents = []
|
| 226 |
|
|
@@ -233,8 +224,7 @@ class PackLoader:
|
|
| 233 |
with open(templates_file, "r", encoding="utf-8") as f:
|
| 234 |
data = json.load(f)
|
| 235 |
|
| 236 |
-
templates = data if isinstance(
|
| 237 |
-
data, list) else data.get("templates", [])
|
| 238 |
|
| 239 |
for template in templates:
|
| 240 |
doc = {
|
|
@@ -260,8 +250,7 @@ class PackLoader:
|
|
| 260 |
self, entry: Dict[str, Any], pack_name: str, doc_id: str
|
| 261 |
) -> Dict[str, Any]:
|
| 262 |
"""Format a pack entry into a document"""
|
| 263 |
-
content = entry.get("content") or entry.get(
|
| 264 |
-
"text") or json.dumps(entry)
|
| 265 |
|
| 266 |
return {
|
| 267 |
"id": f"{pack_name}/{doc_id}",
|
|
|
|
| 37 |
|
| 38 |
pack_docs = self._load_pack(pack_dir, pack_name)
|
| 39 |
documents.extend(pack_docs)
|
| 40 |
+
logger.info(f"✓ Loaded {len(pack_docs)} documents from {pack_name}")
|
|
|
|
| 41 |
|
| 42 |
self.documents = documents
|
| 43 |
return documents
|
| 44 |
|
| 45 |
+
def _load_pack(self, pack_dir: Path, pack_name: str) -> List[Dict[str, Any]]:
|
|
|
|
| 46 |
"""Load documents from a specific pack"""
|
| 47 |
documents = []
|
| 48 |
|
|
|
|
| 59 |
|
| 60 |
return documents
|
| 61 |
|
| 62 |
+
def _is_valid_warbler_pack(self, pack_dir: Path, pack_name: str, jsonl_file: Path) -> bool:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
"""Validate that a directory is a valid Warbler pack
|
| 64 |
|
| 65 |
A valid Warbler pack must have:
|
|
|
|
| 99 |
# For chunked packs, look for chunk files
|
| 100 |
chunk_files = list(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
|
| 101 |
if chunk_files:
|
| 102 |
+
logger.debug(f"Found {len(chunk_files)} chunk files for {pack_name}")
|
|
|
|
| 103 |
return True
|
| 104 |
else:
|
| 105 |
logger.warning(f"Chunked pack {pack_name} has no chunk files")
|
|
|
|
| 109 |
if jsonl_file.exists():
|
| 110 |
return True
|
| 111 |
else:
|
| 112 |
+
logger.warning(f"Single-file pack {pack_name} missing JSONL file: {jsonl_file}")
|
|
|
|
| 113 |
return False
|
| 114 |
|
| 115 |
+
def _load_jsonl_pack(self, pack_dir: Path, pack_name: str) -> List[Dict[str, Any]]:
|
|
|
|
| 116 |
"""Load JSONL-based pack (supports both single-file and chunked packs)"""
|
| 117 |
documents = []
|
| 118 |
|
|
|
|
| 126 |
with open(package_json, "r", encoding="utf-8") as f:
|
| 127 |
metadata = json.load(f)
|
| 128 |
is_chunked = metadata.get("chunked", False)
|
| 129 |
+
chunk_pattern = metadata.get("chunk_pattern", f"{pack_name}-chunk-*.jsonl")
|
|
|
|
| 130 |
except (json.JSONDecodeError, IOError) as e:
|
| 131 |
+
logger.warning(f"Could not read package.json for {pack_name}: {e}")
|
|
|
|
| 132 |
|
| 133 |
if is_chunked:
|
| 134 |
# Load chunked pack
|
|
|
|
| 141 |
chunk_files = sorted(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
|
| 142 |
|
| 143 |
if not chunk_files:
|
| 144 |
+
logger.warning(f"No chunk files found for chunked pack {pack_name}")
|
|
|
|
| 145 |
return documents
|
| 146 |
|
| 147 |
+
logger.info(f"Found {len(chunk_files)} chunk files for {pack_name}")
|
|
|
|
| 148 |
|
| 149 |
# Load each chunk file in order
|
| 150 |
for chunk_file in chunk_files:
|
|
|
|
| 155 |
logger.info(
|
| 156 |
f"Loaded {
|
| 157 |
len(documents)} total documents from {
|
| 158 |
+
len(chunk_files)} chunks"
|
| 159 |
+
)
|
| 160 |
else:
|
| 161 |
# Load single-file pack (backward compatibility)
|
| 162 |
jsonl_file = pack_dir / f"{pack_name}.jsonl"
|
|
|
|
| 168 |
|
| 169 |
return documents
|
| 170 |
|
| 171 |
+
def _load_jsonl_file(self, jsonl_file: Path, pack_name: str) -> List[Dict[str, Any]]:
|
|
|
|
| 172 |
"""Load a single JSONL file with robust error handling"""
|
| 173 |
documents = []
|
| 174 |
error_count = 0
|
|
|
|
| 183 |
try:
|
| 184 |
entry = json.loads(line)
|
| 185 |
doc = self._format_document(
|
| 186 |
+
entry,
|
| 187 |
+
pack_name,
|
| 188 |
+
f"{
|
| 189 |
+
jsonl_file.stem}_line_{line_num}",
|
| 190 |
+
)
|
| 191 |
documents.append(doc)
|
| 192 |
except json.JSONDecodeError as e:
|
| 193 |
error_count += 1
|
|
|
|
| 195 |
if error_count <= max_errors_to_log:
|
| 196 |
logger.warning(
|
| 197 |
f"Error parsing line {line_num} in {
|
| 198 |
+
jsonl_file.name}: {e}"
|
| 199 |
+
)
|
| 200 |
# Continue processing other lines instead of failing
|
| 201 |
continue
|
| 202 |
|
|
|
|
| 204 |
logger.info(
|
| 205 |
f"Loaded {
|
| 206 |
len(documents)} documents from {
|
| 207 |
+
jsonl_file.name} ({error_count} lines skipped due to errors)"
|
| 208 |
+
)
|
| 209 |
except Exception as e:
|
| 210 |
logger.error(f"Error loading JSONL file {jsonl_file}: {e}")
|
| 211 |
|
| 212 |
return documents
|
| 213 |
|
| 214 |
+
def _load_structured_pack(self, pack_dir: Path, pack_name: str) -> List[Dict[str, Any]]:
|
|
|
|
| 215 |
"""Load structured pack with templates"""
|
| 216 |
documents = []
|
| 217 |
|
|
|
|
| 224 |
with open(templates_file, "r", encoding="utf-8") as f:
|
| 225 |
data = json.load(f)
|
| 226 |
|
| 227 |
+
templates = data if isinstance(data, list) else data.get("templates", [])
|
|
|
|
| 228 |
|
| 229 |
for template in templates:
|
| 230 |
doc = {
|
|
|
|
| 250 |
self, entry: Dict[str, Any], pack_name: str, doc_id: str
|
| 251 |
) -> Dict[str, Any]:
|
| 252 |
"""Format a pack entry into a document"""
|
| 253 |
+
content = entry.get("content") or entry.get("text") or json.dumps(entry)
|
|
|
|
| 254 |
|
| 255 |
return {
|
| 256 |
"id": f"{pack_name}/{doc_id}",
|
warbler_cda/retrieval_api.py
CHANGED
|
@@ -31,8 +31,7 @@ class RetrievalQuery:
|
|
| 31 |
mode: RetrievalMode
|
| 32 |
anchor_ids: Optional[List[str]] = None
|
| 33 |
semantic_query: Optional[str] = None
|
| 34 |
-
temporal_range: Optional[Tuple[float, float]
|
| 35 |
-
] = None # (start_time, end_time)
|
| 36 |
max_results: int = 10
|
| 37 |
confidence_threshold: float = 0.6
|
| 38 |
exclude_conflicts: bool = True
|
|
@@ -129,17 +128,13 @@ class RetrievalAPI:
|
|
| 129 |
self.quality_threshold = self.config.get("quality_threshold", 0.6)
|
| 130 |
|
| 131 |
# STAT7 hybrid scoring configuration
|
| 132 |
-
self.enable_stat7_hybrid = self.config.get(
|
| 133 |
-
|
| 134 |
-
self.
|
| 135 |
-
"default_weight_semantic", 0.6)
|
| 136 |
-
self.default_weight_stat7 = self.config.get(
|
| 137 |
-
"default_weight_stat7", 0.4)
|
| 138 |
|
| 139 |
# Retrieval cache (for performance)
|
| 140 |
self.query_cache: Dict[str, ContextAssembly] = {}
|
| 141 |
-
self.cache_ttl_seconds = self.config.get(
|
| 142 |
-
"cache_ttl_seconds", 300) # 5 minutes
|
| 143 |
|
| 144 |
# Document STAT7 assignments cache (for rapid re-retrieval)
|
| 145 |
self.document_stat7_cache: Dict[str, Dict[str, Any]] = {}
|
|
@@ -158,8 +153,7 @@ class RetrievalAPI:
|
|
| 158 |
"quality_distribution": {"high": 0, "medium": 0, "low": 0},
|
| 159 |
}
|
| 160 |
|
| 161 |
-
def retrieve_context(
|
| 162 |
-
self, query: Union[RetrievalQuery, Dict[str, Any]]) -> ContextAssembly:
|
| 163 |
"""
|
| 164 |
Main retrieval method - assemble context based on query.
|
| 165 |
|
|
@@ -243,10 +237,7 @@ class RetrievalAPI:
|
|
| 243 |
assembly = self.retrieve_context(query)
|
| 244 |
return assembly.results
|
| 245 |
|
| 246 |
-
def get_anchor_context(
|
| 247 |
-
self,
|
| 248 |
-
anchor_id: str,
|
| 249 |
-
context_radius: int = 3) -> ContextAssembly:
|
| 250 |
"""
|
| 251 |
Get context around a specific anchor.
|
| 252 |
|
|
@@ -266,10 +257,7 @@ class RetrievalAPI:
|
|
| 266 |
|
| 267 |
return self.retrieve_context(query)
|
| 268 |
|
| 269 |
-
def trace_provenance(
|
| 270 |
-
self,
|
| 271 |
-
content_id: str,
|
| 272 |
-
max_depth: int = 5) -> ContextAssembly:
|
| 273 |
"""
|
| 274 |
Trace provenance chain for a piece of content.
|
| 275 |
|
|
@@ -327,10 +315,12 @@ class RetrievalAPI:
|
|
| 327 |
if embedding:
|
| 328 |
doc_entry["embedding"] = embedding
|
| 329 |
|
| 330 |
-
if (
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
|
|
|
|
|
|
| 334 |
|
| 335 |
if stat7_coordinates:
|
| 336 |
doc_entry["stat7_coordinates"] = stat7_coordinates
|
|
@@ -367,22 +357,17 @@ class RetrievalAPI:
|
|
| 367 |
anchor_ids=query_dict.get("anchor_ids"),
|
| 368 |
semantic_query=query_dict.get("semantic_query"),
|
| 369 |
temporal_range=query_dict.get("temporal_range"),
|
| 370 |
-
max_results=query_dict.get(
|
| 371 |
-
"max_results", self.default_max_results),
|
| 372 |
confidence_threshold=query_dict.get("confidence_threshold", 0.6),
|
| 373 |
exclude_conflicts=query_dict.get("exclude_conflicts", True),
|
| 374 |
include_provenance=query_dict.get("include_provenance", True),
|
| 375 |
-
stat7_hybrid=query_dict.get(
|
| 376 |
-
"stat7_hybrid", self.enable_stat7_hybrid),
|
| 377 |
stat7_address=query_dict.get("stat7_address"),
|
| 378 |
-
weight_semantic=query_dict.get(
|
| 379 |
-
|
| 380 |
-
weight_stat7=query_dict.get(
|
| 381 |
-
"weight_stat7", self.default_weight_stat7),
|
| 382 |
)
|
| 383 |
|
| 384 |
-
def _retrieve_semantic_similarity(
|
| 385 |
-
self, query: RetrievalQuery) -> List[RetrievalResult]:
|
| 386 |
"""Retrieve content based on semantic similarity."""
|
| 387 |
results = []
|
| 388 |
|
|
@@ -403,15 +388,13 @@ class RetrievalAPI:
|
|
| 403 |
self.semantic_anchors}",
|
| 404 |
file=sys.stderr,
|
| 405 |
)
|
| 406 |
-
print(
|
| 407 |
-
f"DEBUG: context_store size={len(self._context_store)}", file=sys.stderr)
|
| 408 |
|
| 409 |
# If embedding provider available, use it
|
| 410 |
if self.embedding_provider:
|
| 411 |
# Get query embedding
|
| 412 |
try:
|
| 413 |
-
query_embedding = self.embedding_provider.embed_text(
|
| 414 |
-
query.semantic_query)
|
| 415 |
except Exception:
|
| 416 |
return results
|
| 417 |
|
|
@@ -431,8 +414,8 @@ class RetrievalAPI:
|
|
| 431 |
content=anchor.concept_text,
|
| 432 |
relevance_score=similarity,
|
| 433 |
temporal_distance=self._calculate_temporal_distance(
|
| 434 |
-
anchor.provenance.first_seen,
|
| 435 |
-
|
| 436 |
anchor_connections=[anchor_id],
|
| 437 |
provenance_depth=1,
|
| 438 |
conflict_flags=[],
|
|
@@ -461,8 +444,8 @@ class RetrievalAPI:
|
|
| 461 |
content=micro.compressed_text,
|
| 462 |
relevance_score=similarity,
|
| 463 |
temporal_distance=self._calculate_temporal_distance(
|
| 464 |
-
micro.creation_timestamp,
|
| 465 |
-
|
| 466 |
anchor_connections=[],
|
| 467 |
provenance_depth=2,
|
| 468 |
conflict_flags=[],
|
|
@@ -481,9 +464,7 @@ class RetrievalAPI:
|
|
| 481 |
|
| 482 |
return results
|
| 483 |
|
| 484 |
-
def _search_context_store(
|
| 485 |
-
self,
|
| 486 |
-
query: RetrievalQuery) -> List[RetrievalResult]:
|
| 487 |
"""
|
| 488 |
Search context store using embeddings (semantic) or keyword fallback.
|
| 489 |
Prefers embedding-based semantic search when available.
|
|
@@ -494,16 +475,14 @@ class RetrievalAPI:
|
|
| 494 |
return results
|
| 495 |
|
| 496 |
try:
|
| 497 |
-
if self.embedding_provider and hasattr(
|
| 498 |
-
self.embedding_provider, "semantic_search"):
|
| 499 |
return self._search_context_store_semantic(query)
|
| 500 |
except Exception as e:
|
| 501 |
pass
|
| 502 |
|
| 503 |
return self._search_context_store_keyword(query)
|
| 504 |
|
| 505 |
-
def _search_context_store_semantic(
|
| 506 |
-
self, query: RetrievalQuery) -> List[RetrievalResult]:
|
| 507 |
"""Search context store using semantic embeddings."""
|
| 508 |
results = []
|
| 509 |
|
|
@@ -540,10 +519,8 @@ class RetrievalAPI:
|
|
| 540 |
hybrid_score = sim_score
|
| 541 |
if query.stat7_hybrid:
|
| 542 |
hybrid_score = (
|
| 543 |
-
query.weight_semantic *
|
| 544 |
-
|
| 545 |
-
query.weight_stat7 *
|
| 546 |
-
stat7_resonance)
|
| 547 |
|
| 548 |
result = RetrievalResult(
|
| 549 |
result_id=f"ctx_{doc_id}",
|
|
@@ -565,8 +542,7 @@ class RetrievalAPI:
|
|
| 565 |
|
| 566 |
return results
|
| 567 |
|
| 568 |
-
def _search_context_store_keyword(
|
| 569 |
-
self, query: RetrievalQuery) -> List[RetrievalResult]:
|
| 570 |
"""Fallback keyword-based search of context store."""
|
| 571 |
results = []
|
| 572 |
|
|
@@ -613,21 +589,16 @@ class RetrievalAPI:
|
|
| 613 |
return 0.5
|
| 614 |
|
| 615 |
try:
|
| 616 |
-
lineage_dist = abs(doc_stat7.get("lineage", 0.5) -
|
| 617 |
-
query_stat7.get("lineage", 0.5))
|
| 618 |
adjacency_dist = abs(
|
| 619 |
-
doc_stat7.get("adjacency", 0.5) -
|
| 620 |
-
query_stat7.get("adjacency", 0.5)
|
| 621 |
)
|
| 622 |
luminosity_dist = abs(
|
| 623 |
-
doc_stat7.get("luminosity", 0.7) -
|
| 624 |
-
query_stat7.get("luminosity", 0.7)
|
| 625 |
)
|
| 626 |
-
polarity_dist = abs(doc_stat7.get(
|
| 627 |
-
"polarity", 0.5) - query_stat7.get("polarity", 0.5))
|
| 628 |
dimensionality_dist = abs(
|
| 629 |
-
doc_stat7.get("dimensionality", 0.5) -
|
| 630 |
-
query_stat7.get("dimensionality", 0.5)
|
| 631 |
)
|
| 632 |
|
| 633 |
avg_distance = (
|
|
@@ -643,8 +614,7 @@ class RetrievalAPI:
|
|
| 643 |
except Exception:
|
| 644 |
return 0.5
|
| 645 |
|
| 646 |
-
def _retrieve_temporal_sequence(
|
| 647 |
-
self, query: RetrievalQuery) -> List[RetrievalResult]:
|
| 648 |
"""Retrieve content based on temporal sequence."""
|
| 649 |
results = []
|
| 650 |
|
|
@@ -664,15 +634,15 @@ class RetrievalAPI:
|
|
| 664 |
for anchor_id, anchor in self.semantic_anchors.anchors.items():
|
| 665 |
if temporal_range[0] <= anchor.provenance.first_seen <= temporal_range[1]:
|
| 666 |
temporal_items.append(
|
| 667 |
-
("anchor", anchor_id, anchor.provenance.first_seen, anchor)
|
|
|
|
| 668 |
|
| 669 |
# Add micro-summaries
|
| 670 |
if self.summarization_ladder:
|
| 671 |
for micro in self.summarization_ladder.micro_summaries:
|
| 672 |
if temporal_range[0] <= micro.creation_timestamp <= temporal_range[1]:
|
| 673 |
temporal_items.append(
|
| 674 |
-
("micro_summary", micro.summary_id,
|
| 675 |
-
micro.creation_timestamp, micro)
|
| 676 |
)
|
| 677 |
|
| 678 |
# Sort by timestamp
|
|
@@ -711,15 +681,13 @@ class RetrievalAPI:
|
|
| 711 |
anchor_connections=[],
|
| 712 |
provenance_depth=2,
|
| 713 |
conflict_flags=[],
|
| 714 |
-
metadata={"timestamp": timestamp,
|
| 715 |
-
"window_size": micro.window_size},
|
| 716 |
)
|
| 717 |
results.append(result)
|
| 718 |
|
| 719 |
return results
|
| 720 |
|
| 721 |
-
def _retrieve_anchor_neighborhood(
|
| 722 |
-
self, query: RetrievalQuery) -> List[RetrievalResult]:
|
| 723 |
"""Retrieve content in the neighborhood of specific anchors."""
|
| 724 |
results = []
|
| 725 |
|
|
@@ -756,15 +724,13 @@ class RetrievalAPI:
|
|
| 756 |
anchor_connections=[anchor_id, other_id],
|
| 757 |
provenance_depth=1,
|
| 758 |
conflict_flags=[],
|
| 759 |
-
metadata={"neighbor_of": anchor_id,
|
| 760 |
-
"similarity": similarity},
|
| 761 |
)
|
| 762 |
results.append(result)
|
| 763 |
|
| 764 |
return results
|
| 765 |
|
| 766 |
-
def _retrieve_provenance_chain(
|
| 767 |
-
self, query: RetrievalQuery) -> List[RetrievalResult]:
|
| 768 |
"""Retrieve content following provenance relationships."""
|
| 769 |
results = []
|
| 770 |
|
|
@@ -795,8 +761,7 @@ class RetrievalAPI:
|
|
| 795 |
results.append(result)
|
| 796 |
|
| 797 |
# Add related content from update history
|
| 798 |
-
for i, update in enumerate(
|
| 799 |
-
anchor.provenance.update_history):
|
| 800 |
if i >= query.max_results - 1:
|
| 801 |
break
|
| 802 |
|
|
@@ -806,20 +771,17 @@ class RetrievalAPI:
|
|
| 806 |
content_id=f"{anchor_id}_update_{i}",
|
| 807 |
content=f"Update: {update.get('context', {}).get('mist_id', 'unknown')}",
|
| 808 |
relevance_score=0.8 - (i * 0.1),
|
| 809 |
-
temporal_distance=abs(
|
| 810 |
-
update["timestamp"] - query.query_timestamp),
|
| 811 |
anchor_connections=[anchor_id],
|
| 812 |
provenance_depth=i + 1,
|
| 813 |
conflict_flags=[],
|
| 814 |
-
metadata={
|
| 815 |
-
"update_context": update.get("context", {})},
|
| 816 |
)
|
| 817 |
results.append(result)
|
| 818 |
|
| 819 |
return results
|
| 820 |
|
| 821 |
-
def _retrieve_conflict_aware(
|
| 822 |
-
self, query: RetrievalQuery) -> List[RetrievalResult]:
|
| 823 |
"""Retrieve content while avoiding conflicts."""
|
| 824 |
# First get base results
|
| 825 |
base_results = self._retrieve_semantic_similarity(query)
|
|
@@ -835,14 +797,14 @@ class RetrievalAPI:
|
|
| 835 |
|
| 836 |
# Check for conflicts involving this content
|
| 837 |
if hasattr(self.conflict_detector, "get_conflict_analysis"):
|
| 838 |
-
conflict_analysis = self.conflict_detector.get_conflict_analysis(
|
| 839 |
-
result.content_id)
|
| 840 |
if conflict_analysis.get("conflicts_found", 0) > 0:
|
| 841 |
conflicts = [
|
| 842 |
f"conflict_confidence_{
|
| 843 |
conflict_analysis.get(
|
| 844 |
'max_confidence',
|
| 845 |
-
0):.2f}"
|
|
|
|
| 846 |
|
| 847 |
# Include result but flag conflicts
|
| 848 |
result.conflict_flags = conflicts
|
|
@@ -851,9 +813,7 @@ class RetrievalAPI:
|
|
| 851 |
|
| 852 |
return filtered_results
|
| 853 |
|
| 854 |
-
def _retrieve_composite(
|
| 855 |
-
self,
|
| 856 |
-
query: RetrievalQuery) -> List[RetrievalResult]:
|
| 857 |
"""Retrieve using multiple modes and combine results."""
|
| 858 |
all_results = []
|
| 859 |
|
|
@@ -896,15 +856,13 @@ class RetrievalAPI:
|
|
| 896 |
self.metrics["hybrid_queries"] += 1
|
| 897 |
|
| 898 |
# Filter by confidence threshold
|
| 899 |
-
filtered = [r for r in results if r.relevance_score >=
|
| 900 |
-
query.confidence_threshold]
|
| 901 |
|
| 902 |
# Apply temporal decay
|
| 903 |
current_time = query.query_timestamp
|
| 904 |
for result in filtered:
|
| 905 |
age_hours = result.temporal_distance / 3600
|
| 906 |
-
decay_factor = max(
|
| 907 |
-
0.1, 1.0 - (age_hours / self.temporal_decay_hours))
|
| 908 |
result.relevance_score *= decay_factor
|
| 909 |
|
| 910 |
# Sort by relevance score
|
|
@@ -937,8 +895,7 @@ class RetrievalAPI:
|
|
| 937 |
# Temporal span
|
| 938 |
timestamps = [r.temporal_distance for r in results]
|
| 939 |
temporal_span_hours = (
|
| 940 |
-
(max(timestamps) - min(timestamps)) /
|
| 941 |
-
3600 if len(timestamps) > 1 else 0
|
| 942 |
)
|
| 943 |
|
| 944 |
# Anchor coverage
|
|
@@ -968,17 +925,11 @@ class RetrievalAPI:
|
|
| 968 |
retrieval_timestamp=time.time(),
|
| 969 |
)
|
| 970 |
|
| 971 |
-
def _calculate_temporal_distance(
|
| 972 |
-
self,
|
| 973 |
-
timestamp: float,
|
| 974 |
-
reference_time: float) -> float:
|
| 975 |
"""Calculate temporal distance between two timestamps."""
|
| 976 |
return abs(timestamp - reference_time)
|
| 977 |
|
| 978 |
-
def _calculate_temporal_relevance(
|
| 979 |
-
self,
|
| 980 |
-
timestamp: float,
|
| 981 |
-
reference_time: float) -> float:
|
| 982 |
"""Calculate relevance based on temporal proximity."""
|
| 983 |
distance_seconds = abs(timestamp - reference_time)
|
| 984 |
distance_hours = distance_seconds / 3600
|
|
@@ -1060,14 +1011,12 @@ class RetrievalAPI:
|
|
| 1060 |
def _update_metrics(self, assembly: ContextAssembly, elapsed_ms: float):
|
| 1061 |
"""Update performance metrics."""
|
| 1062 |
self.metrics["average_results_per_query"] = (
|
| 1063 |
-
self.metrics["average_results_per_query"] *
|
| 1064 |
-
(self.metrics["total_queries"] - 1)
|
| 1065 |
+ len(assembly.results)
|
| 1066 |
) / self.metrics["total_queries"]
|
| 1067 |
|
| 1068 |
self.metrics["average_retrieval_time_ms"] = (
|
| 1069 |
-
self.metrics["average_retrieval_time_ms"] *
|
| 1070 |
-
(self.metrics["total_queries"] - 1)
|
| 1071 |
+ elapsed_ms
|
| 1072 |
) / self.metrics["total_queries"]
|
| 1073 |
|
|
@@ -1081,8 +1030,7 @@ class RetrievalAPI:
|
|
| 1081 |
|
| 1082 |
def _calculate_cache_hit_rate(self) -> float:
|
| 1083 |
"""Calculate cache hit rate."""
|
| 1084 |
-
total_requests = self.metrics["cache_hits"] +
|
| 1085 |
-
self.metrics["cache_misses"]
|
| 1086 |
if total_requests == 0:
|
| 1087 |
return 0.0
|
| 1088 |
return self.metrics["cache_hits"] / total_requests
|
|
@@ -1228,8 +1176,8 @@ class RetrievalAPI:
|
|
| 1228 |
try:
|
| 1229 |
q_stat7_dict = query.stat7_address
|
| 1230 |
query_realm = Realm(
|
| 1231 |
-
type=q_stat7_dict["realm"]["type"],
|
| 1232 |
-
|
| 1233 |
query_stat7 = STAT7Address(
|
| 1234 |
realm=query_realm,
|
| 1235 |
lineage=q_stat7_dict["lineage"],
|
|
@@ -1259,8 +1207,8 @@ class RetrievalAPI:
|
|
| 1259 |
|
| 1260 |
try:
|
| 1261 |
doc_realm = Realm(
|
| 1262 |
-
type=doc_stat7_dict["realm"]["type"],
|
| 1263 |
-
|
| 1264 |
doc_stat7 = STAT7Address(
|
| 1265 |
realm=doc_realm,
|
| 1266 |
lineage=doc_stat7_dict["lineage"],
|
|
@@ -1275,8 +1223,7 @@ class RetrievalAPI:
|
|
| 1275 |
continue
|
| 1276 |
|
| 1277 |
# Compute STAT7 resonance score
|
| 1278 |
-
stat7_res = self.stat7_bridge.stat7_resonance(
|
| 1279 |
-
query_stat7, doc_stat7)
|
| 1280 |
result.stat7_resonance = stat7_res
|
| 1281 |
|
| 1282 |
# Compute semantic similarity (if available)
|
|
@@ -1284,8 +1231,7 @@ class RetrievalAPI:
|
|
| 1284 |
result.semantic_similarity = semantic_sim
|
| 1285 |
|
| 1286 |
# Combine into hybrid score
|
| 1287 |
-
hybrid = (query.weight_semantic * semantic_sim) +
|
| 1288 |
-
(query.weight_stat7 * stat7_res)
|
| 1289 |
result.relevance_score = max(0.0, min(hybrid, 1.0))
|
| 1290 |
|
| 1291 |
return results
|
|
|
|
| 31 |
mode: RetrievalMode
|
| 32 |
anchor_ids: Optional[List[str]] = None
|
| 33 |
semantic_query: Optional[str] = None
|
| 34 |
+
temporal_range: Optional[Tuple[float, float]] = None # (start_time, end_time)
|
|
|
|
| 35 |
max_results: int = 10
|
| 36 |
confidence_threshold: float = 0.6
|
| 37 |
exclude_conflicts: bool = True
|
|
|
|
| 128 |
self.quality_threshold = self.config.get("quality_threshold", 0.6)
|
| 129 |
|
| 130 |
# STAT7 hybrid scoring configuration
|
| 131 |
+
self.enable_stat7_hybrid = self.config.get("enable_stat7_hybrid", False)
|
| 132 |
+
self.default_weight_semantic = self.config.get("default_weight_semantic", 0.6)
|
| 133 |
+
self.default_weight_stat7 = self.config.get("default_weight_stat7", 0.4)
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# Retrieval cache (for performance)
|
| 136 |
self.query_cache: Dict[str, ContextAssembly] = {}
|
| 137 |
+
self.cache_ttl_seconds = self.config.get("cache_ttl_seconds", 300) # 5 minutes
|
|
|
|
| 138 |
|
| 139 |
# Document STAT7 assignments cache (for rapid re-retrieval)
|
| 140 |
self.document_stat7_cache: Dict[str, Dict[str, Any]] = {}
|
|
|
|
| 153 |
"quality_distribution": {"high": 0, "medium": 0, "low": 0},
|
| 154 |
}
|
| 155 |
|
| 156 |
+
def retrieve_context(self, query: Union[RetrievalQuery, Dict[str, Any]]) -> ContextAssembly:
|
|
|
|
| 157 |
"""
|
| 158 |
Main retrieval method - assemble context based on query.
|
| 159 |
|
|
|
|
| 237 |
assembly = self.retrieve_context(query)
|
| 238 |
return assembly.results
|
| 239 |
|
| 240 |
+
def get_anchor_context(self, anchor_id: str, context_radius: int = 3) -> ContextAssembly:
|
|
|
|
|
|
|
|
|
|
| 241 |
"""
|
| 242 |
Get context around a specific anchor.
|
| 243 |
|
|
|
|
| 257 |
|
| 258 |
return self.retrieve_context(query)
|
| 259 |
|
| 260 |
+
def trace_provenance(self, content_id: str, max_depth: int = 5) -> ContextAssembly:
|
|
|
|
|
|
|
|
|
|
| 261 |
"""
|
| 262 |
Trace provenance chain for a piece of content.
|
| 263 |
|
|
|
|
| 315 |
if embedding:
|
| 316 |
doc_entry["embedding"] = embedding
|
| 317 |
|
| 318 |
+
if (
|
| 319 |
+
stat7_coordinates is None
|
| 320 |
+
and embedding
|
| 321 |
+
and hasattr(self.embedding_provider, "compute_stat7_from_embedding")
|
| 322 |
+
):
|
| 323 |
+
stat7_coordinates = self.embedding_provider.compute_stat7_from_embedding(embedding)
|
| 324 |
|
| 325 |
if stat7_coordinates:
|
| 326 |
doc_entry["stat7_coordinates"] = stat7_coordinates
|
|
|
|
| 357 |
anchor_ids=query_dict.get("anchor_ids"),
|
| 358 |
semantic_query=query_dict.get("semantic_query"),
|
| 359 |
temporal_range=query_dict.get("temporal_range"),
|
| 360 |
+
max_results=query_dict.get("max_results", self.default_max_results),
|
|
|
|
| 361 |
confidence_threshold=query_dict.get("confidence_threshold", 0.6),
|
| 362 |
exclude_conflicts=query_dict.get("exclude_conflicts", True),
|
| 363 |
include_provenance=query_dict.get("include_provenance", True),
|
| 364 |
+
stat7_hybrid=query_dict.get("stat7_hybrid", self.enable_stat7_hybrid),
|
|
|
|
| 365 |
stat7_address=query_dict.get("stat7_address"),
|
| 366 |
+
weight_semantic=query_dict.get("weight_semantic", self.default_weight_semantic),
|
| 367 |
+
weight_stat7=query_dict.get("weight_stat7", self.default_weight_stat7),
|
|
|
|
|
|
|
| 368 |
)
|
| 369 |
|
| 370 |
+
def _retrieve_semantic_similarity(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
| 371 |
"""Retrieve content based on semantic similarity."""
|
| 372 |
results = []
|
| 373 |
|
|
|
|
| 388 |
self.semantic_anchors}",
|
| 389 |
file=sys.stderr,
|
| 390 |
)
|
| 391 |
+
print(f"DEBUG: context_store size={len(self._context_store)}", file=sys.stderr)
|
|
|
|
| 392 |
|
| 393 |
# If embedding provider available, use it
|
| 394 |
if self.embedding_provider:
|
| 395 |
# Get query embedding
|
| 396 |
try:
|
| 397 |
+
query_embedding = self.embedding_provider.embed_text(query.semantic_query)
|
|
|
|
| 398 |
except Exception:
|
| 399 |
return results
|
| 400 |
|
|
|
|
| 414 |
content=anchor.concept_text,
|
| 415 |
relevance_score=similarity,
|
| 416 |
temporal_distance=self._calculate_temporal_distance(
|
| 417 |
+
anchor.provenance.first_seen, query.query_timestamp
|
| 418 |
+
),
|
| 419 |
anchor_connections=[anchor_id],
|
| 420 |
provenance_depth=1,
|
| 421 |
conflict_flags=[],
|
|
|
|
| 444 |
content=micro.compressed_text,
|
| 445 |
relevance_score=similarity,
|
| 446 |
temporal_distance=self._calculate_temporal_distance(
|
| 447 |
+
micro.creation_timestamp, query.query_timestamp
|
| 448 |
+
),
|
| 449 |
anchor_connections=[],
|
| 450 |
provenance_depth=2,
|
| 451 |
conflict_flags=[],
|
|
|
|
| 464 |
|
| 465 |
return results
|
| 466 |
|
| 467 |
+
def _search_context_store(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
|
|
|
| 468 |
"""
|
| 469 |
Search context store using embeddings (semantic) or keyword fallback.
|
| 470 |
Prefers embedding-based semantic search when available.
|
|
|
|
| 475 |
return results
|
| 476 |
|
| 477 |
try:
|
| 478 |
+
if self.embedding_provider and hasattr(self.embedding_provider, "semantic_search"):
|
|
|
|
| 479 |
return self._search_context_store_semantic(query)
|
| 480 |
except Exception as e:
|
| 481 |
pass
|
| 482 |
|
| 483 |
return self._search_context_store_keyword(query)
|
| 484 |
|
| 485 |
+
def _search_context_store_semantic(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
| 486 |
"""Search context store using semantic embeddings."""
|
| 487 |
results = []
|
| 488 |
|
|
|
|
| 519 |
hybrid_score = sim_score
|
| 520 |
if query.stat7_hybrid:
|
| 521 |
hybrid_score = (
|
| 522 |
+
query.weight_semantic * sim_score + query.weight_stat7 * stat7_resonance
|
| 523 |
+
)
|
|
|
|
|
|
|
| 524 |
|
| 525 |
result = RetrievalResult(
|
| 526 |
result_id=f"ctx_{doc_id}",
|
|
|
|
| 542 |
|
| 543 |
return results
|
| 544 |
|
| 545 |
+
def _search_context_store_keyword(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
| 546 |
"""Fallback keyword-based search of context store."""
|
| 547 |
results = []
|
| 548 |
|
|
|
|
| 589 |
return 0.5
|
| 590 |
|
| 591 |
try:
|
| 592 |
+
lineage_dist = abs(doc_stat7.get("lineage", 0.5) - query_stat7.get("lineage", 0.5))
|
|
|
|
| 593 |
adjacency_dist = abs(
|
| 594 |
+
doc_stat7.get("adjacency", 0.5) - query_stat7.get("adjacency", 0.5)
|
|
|
|
| 595 |
)
|
| 596 |
luminosity_dist = abs(
|
| 597 |
+
doc_stat7.get("luminosity", 0.7) - query_stat7.get("luminosity", 0.7)
|
|
|
|
| 598 |
)
|
| 599 |
+
polarity_dist = abs(doc_stat7.get("polarity", 0.5) - query_stat7.get("polarity", 0.5))
|
|
|
|
| 600 |
dimensionality_dist = abs(
|
| 601 |
+
doc_stat7.get("dimensionality", 0.5) - query_stat7.get("dimensionality", 0.5)
|
|
|
|
| 602 |
)
|
| 603 |
|
| 604 |
avg_distance = (
|
|
|
|
| 614 |
except Exception:
|
| 615 |
return 0.5
|
| 616 |
|
| 617 |
+
def _retrieve_temporal_sequence(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
| 618 |
"""Retrieve content based on temporal sequence."""
|
| 619 |
results = []
|
| 620 |
|
|
|
|
| 634 |
for anchor_id, anchor in self.semantic_anchors.anchors.items():
|
| 635 |
if temporal_range[0] <= anchor.provenance.first_seen <= temporal_range[1]:
|
| 636 |
temporal_items.append(
|
| 637 |
+
("anchor", anchor_id, anchor.provenance.first_seen, anchor)
|
| 638 |
+
)
|
| 639 |
|
| 640 |
# Add micro-summaries
|
| 641 |
if self.summarization_ladder:
|
| 642 |
for micro in self.summarization_ladder.micro_summaries:
|
| 643 |
if temporal_range[0] <= micro.creation_timestamp <= temporal_range[1]:
|
| 644 |
temporal_items.append(
|
| 645 |
+
("micro_summary", micro.summary_id, micro.creation_timestamp, micro)
|
|
|
|
| 646 |
)
|
| 647 |
|
| 648 |
# Sort by timestamp
|
|
|
|
| 681 |
anchor_connections=[],
|
| 682 |
provenance_depth=2,
|
| 683 |
conflict_flags=[],
|
| 684 |
+
metadata={"timestamp": timestamp, "window_size": micro.window_size},
|
|
|
|
| 685 |
)
|
| 686 |
results.append(result)
|
| 687 |
|
| 688 |
return results
|
| 689 |
|
| 690 |
+
def _retrieve_anchor_neighborhood(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
| 691 |
"""Retrieve content in the neighborhood of specific anchors."""
|
| 692 |
results = []
|
| 693 |
|
|
|
|
| 724 |
anchor_connections=[anchor_id, other_id],
|
| 725 |
provenance_depth=1,
|
| 726 |
conflict_flags=[],
|
| 727 |
+
metadata={"neighbor_of": anchor_id, "similarity": similarity},
|
|
|
|
| 728 |
)
|
| 729 |
results.append(result)
|
| 730 |
|
| 731 |
return results
|
| 732 |
|
| 733 |
+
def _retrieve_provenance_chain(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
| 734 |
"""Retrieve content following provenance relationships."""
|
| 735 |
results = []
|
| 736 |
|
|
|
|
| 761 |
results.append(result)
|
| 762 |
|
| 763 |
# Add related content from update history
|
| 764 |
+
for i, update in enumerate(anchor.provenance.update_history):
|
|
|
|
| 765 |
if i >= query.max_results - 1:
|
| 766 |
break
|
| 767 |
|
|
|
|
| 771 |
content_id=f"{anchor_id}_update_{i}",
|
| 772 |
content=f"Update: {update.get('context', {}).get('mist_id', 'unknown')}",
|
| 773 |
relevance_score=0.8 - (i * 0.1),
|
| 774 |
+
temporal_distance=abs(update["timestamp"] - query.query_timestamp),
|
|
|
|
| 775 |
anchor_connections=[anchor_id],
|
| 776 |
provenance_depth=i + 1,
|
| 777 |
conflict_flags=[],
|
| 778 |
+
metadata={"update_context": update.get("context", {})},
|
|
|
|
| 779 |
)
|
| 780 |
results.append(result)
|
| 781 |
|
| 782 |
return results
|
| 783 |
|
| 784 |
+
def _retrieve_conflict_aware(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
| 785 |
"""Retrieve content while avoiding conflicts."""
|
| 786 |
# First get base results
|
| 787 |
base_results = self._retrieve_semantic_similarity(query)
|
|
|
|
| 797 |
|
| 798 |
# Check for conflicts involving this content
|
| 799 |
if hasattr(self.conflict_detector, "get_conflict_analysis"):
|
| 800 |
+
conflict_analysis = self.conflict_detector.get_conflict_analysis(result.content_id)
|
|
|
|
| 801 |
if conflict_analysis.get("conflicts_found", 0) > 0:
|
| 802 |
conflicts = [
|
| 803 |
f"conflict_confidence_{
|
| 804 |
conflict_analysis.get(
|
| 805 |
'max_confidence',
|
| 806 |
+
0):.2f}"
|
| 807 |
+
]
|
| 808 |
|
| 809 |
# Include result but flag conflicts
|
| 810 |
result.conflict_flags = conflicts
|
|
|
|
| 813 |
|
| 814 |
return filtered_results
|
| 815 |
|
| 816 |
+
def _retrieve_composite(self, query: RetrievalQuery) -> List[RetrievalResult]:
|
|
|
|
|
|
|
| 817 |
"""Retrieve using multiple modes and combine results."""
|
| 818 |
all_results = []
|
| 819 |
|
|
|
|
| 856 |
self.metrics["hybrid_queries"] += 1
|
| 857 |
|
| 858 |
# Filter by confidence threshold
|
| 859 |
+
filtered = [r for r in results if r.relevance_score >= query.confidence_threshold]
|
|
|
|
| 860 |
|
| 861 |
# Apply temporal decay
|
| 862 |
current_time = query.query_timestamp
|
| 863 |
for result in filtered:
|
| 864 |
age_hours = result.temporal_distance / 3600
|
| 865 |
+
decay_factor = max(0.1, 1.0 - (age_hours / self.temporal_decay_hours))
|
|
|
|
| 866 |
result.relevance_score *= decay_factor
|
| 867 |
|
| 868 |
# Sort by relevance score
|
|
|
|
| 895 |
# Temporal span
|
| 896 |
timestamps = [r.temporal_distance for r in results]
|
| 897 |
temporal_span_hours = (
|
| 898 |
+
(max(timestamps) - min(timestamps)) / 3600 if len(timestamps) > 1 else 0
|
|
|
|
| 899 |
)
|
| 900 |
|
| 901 |
# Anchor coverage
|
|
|
|
| 925 |
retrieval_timestamp=time.time(),
|
| 926 |
)
|
| 927 |
|
| 928 |
+
def _calculate_temporal_distance(self, timestamp: float, reference_time: float) -> float:
|
|
|
|
|
|
|
|
|
|
| 929 |
"""Calculate temporal distance between two timestamps."""
|
| 930 |
return abs(timestamp - reference_time)
|
| 931 |
|
| 932 |
+
def _calculate_temporal_relevance(self, timestamp: float, reference_time: float) -> float:
|
|
|
|
|
|
|
|
|
|
| 933 |
"""Calculate relevance based on temporal proximity."""
|
| 934 |
distance_seconds = abs(timestamp - reference_time)
|
| 935 |
distance_hours = distance_seconds / 3600
|
|
|
|
| 1011 |
def _update_metrics(self, assembly: ContextAssembly, elapsed_ms: float):
|
| 1012 |
"""Update performance metrics."""
|
| 1013 |
self.metrics["average_results_per_query"] = (
|
| 1014 |
+
self.metrics["average_results_per_query"] * (self.metrics["total_queries"] - 1)
|
|
|
|
| 1015 |
+ len(assembly.results)
|
| 1016 |
) / self.metrics["total_queries"]
|
| 1017 |
|
| 1018 |
self.metrics["average_retrieval_time_ms"] = (
|
| 1019 |
+
self.metrics["average_retrieval_time_ms"] * (self.metrics["total_queries"] - 1)
|
|
|
|
| 1020 |
+ elapsed_ms
|
| 1021 |
) / self.metrics["total_queries"]
|
| 1022 |
|
|
|
|
| 1030 |
|
| 1031 |
def _calculate_cache_hit_rate(self) -> float:
|
| 1032 |
"""Calculate cache hit rate."""
|
| 1033 |
+
total_requests = self.metrics["cache_hits"] + self.metrics["cache_misses"]
|
|
|
|
| 1034 |
if total_requests == 0:
|
| 1035 |
return 0.0
|
| 1036 |
return self.metrics["cache_hits"] / total_requests
|
|
|
|
| 1176 |
try:
|
| 1177 |
q_stat7_dict = query.stat7_address
|
| 1178 |
query_realm = Realm(
|
| 1179 |
+
type=q_stat7_dict["realm"]["type"], label=q_stat7_dict["realm"]["label"]
|
| 1180 |
+
)
|
| 1181 |
query_stat7 = STAT7Address(
|
| 1182 |
realm=query_realm,
|
| 1183 |
lineage=q_stat7_dict["lineage"],
|
|
|
|
| 1207 |
|
| 1208 |
try:
|
| 1209 |
doc_realm = Realm(
|
| 1210 |
+
type=doc_stat7_dict["realm"]["type"], label=doc_stat7_dict["realm"]["label"]
|
| 1211 |
+
)
|
| 1212 |
doc_stat7 = STAT7Address(
|
| 1213 |
realm=doc_realm,
|
| 1214 |
lineage=doc_stat7_dict["lineage"],
|
|
|
|
| 1223 |
continue
|
| 1224 |
|
| 1225 |
# Compute STAT7 resonance score
|
| 1226 |
+
stat7_res = self.stat7_bridge.stat7_resonance(query_stat7, doc_stat7)
|
|
|
|
| 1227 |
result.stat7_resonance = stat7_res
|
| 1228 |
|
| 1229 |
# Compute semantic similarity (if available)
|
|
|
|
| 1231 |
result.semantic_similarity = semantic_sim
|
| 1232 |
|
| 1233 |
# Combine into hybrid score
|
| 1234 |
+
hybrid = (query.weight_semantic * semantic_sim) + (query.weight_stat7 * stat7_res)
|
|
|
|
| 1235 |
result.relevance_score = max(0.0, min(hybrid, 1.0))
|
| 1236 |
|
| 1237 |
return results
|
warbler_cda/stat7_entity.py
CHANGED
|
@@ -251,8 +251,7 @@ class STAT7Entity(ABC):
|
|
| 251 |
# Event Tracking
|
| 252 |
# ========================================================================
|
| 253 |
|
| 254 |
-
def _record_event(self, event_type: str, description: str,
|
| 255 |
-
metadata: Dict[str, Any] = None):
|
| 256 |
"""Record a lifecycle event"""
|
| 257 |
event = LifecycleEvent(
|
| 258 |
timestamp=datetime.utcnow(),
|
|
@@ -263,12 +262,9 @@ class STAT7Entity(ABC):
|
|
| 263 |
self.lifecycle_events.append(event)
|
| 264 |
self.last_activity = event.timestamp
|
| 265 |
|
| 266 |
-
def get_event_history(
|
| 267 |
-
self,
|
| 268 |
-
limit: Optional[int] = None) -> List[LifecycleEvent]:
|
| 269 |
"""Get lifecycle events, optionally limited to most recent"""
|
| 270 |
-
events = sorted(self.lifecycle_events,
|
| 271 |
-
key=lambda e: e.timestamp, reverse=True)
|
| 272 |
return events[:limit] if limit else events
|
| 273 |
|
| 274 |
# ========================================================================
|
|
@@ -284,8 +280,7 @@ class STAT7Entity(ABC):
|
|
| 284 |
self.entangled_entities.append(other_entity_id)
|
| 285 |
self.entanglement_strength.append(strength)
|
| 286 |
self._record_event(
|
| 287 |
-
"entanglement_added", f"Entangled with {other_entity_id}", {
|
| 288 |
-
"strength": strength}
|
| 289 |
)
|
| 290 |
|
| 291 |
def remove_entanglement(self, other_entity_id: str):
|
|
@@ -294,17 +289,13 @@ class STAT7Entity(ABC):
|
|
| 294 |
idx = self.entangled_entities.index(other_entity_id)
|
| 295 |
self.entangled_entities.pop(idx)
|
| 296 |
self.entanglement_strength.pop(idx)
|
| 297 |
-
self._record_event("entanglement_removed",
|
| 298 |
-
f"Untangled from {other_entity_id}")
|
| 299 |
|
| 300 |
def get_entanglements(self) -> List[Tuple[str, float]]:
|
| 301 |
"""Get all entangled entities with strength"""
|
| 302 |
return list(zip(self.entangled_entities, self.entanglement_strength))
|
| 303 |
|
| 304 |
-
def update_entanglement_strength(
|
| 305 |
-
self,
|
| 306 |
-
other_entity_id: str,
|
| 307 |
-
new_strength: float):
|
| 308 |
"""Update entanglement strength with another entity"""
|
| 309 |
if other_entity_id in self.entangled_entities:
|
| 310 |
idx = self.entangled_entities.index(other_entity_id)
|
|
@@ -365,8 +356,7 @@ class STAT7Entity(ABC):
|
|
| 365 |
{"trait_type": "Realm", "value": self.stat7.realm.value},
|
| 366 |
{"trait_type": "Lineage", "value": self.stat7.lineage},
|
| 367 |
{"trait_type": "Horizon", "value": self.stat7.horizon.value},
|
| 368 |
-
{"trait_type": "Luminosity",
|
| 369 |
-
"value": int(self.stat7.luminosity)},
|
| 370 |
{"trait_type": "Polarity", "value": self.stat7.polarity.value},
|
| 371 |
{"trait_type": "Dimensionality", "value": self.stat7.dimensionality},
|
| 372 |
{"trait_type": "STAT7 Address", "value": self.stat7.address},
|
|
@@ -374,11 +364,7 @@ class STAT7Entity(ABC):
|
|
| 374 |
"properties": card_data.get("properties", {}),
|
| 375 |
}
|
| 376 |
|
| 377 |
-
def record_mint(
|
| 378 |
-
self,
|
| 379 |
-
contract_address: str,
|
| 380 |
-
token_id: int,
|
| 381 |
-
ipfs_hash: str):
|
| 382 |
"""Record successful NFT minting"""
|
| 383 |
self.nft_minted = True
|
| 384 |
self.nft_contract = contract_address
|
|
@@ -387,8 +373,7 @@ class STAT7Entity(ABC):
|
|
| 387 |
self._record_event(
|
| 388 |
"nft_minted",
|
| 389 |
f"Minted as ERC-721 token #{token_id}",
|
| 390 |
-
{"contract": contract_address,
|
| 391 |
-
"token_id": token_id, "ipfs_hash": ipfs_hash},
|
| 392 |
)
|
| 393 |
|
| 394 |
# ========================================================================
|
|
@@ -501,8 +486,7 @@ class STAT7Entity(ABC):
|
|
| 501 |
"type": "entity_profile",
|
| 502 |
**card_data,
|
| 503 |
"owner": self.owner_id,
|
| 504 |
-
"lifecycle_events": [
|
| 505 |
-
e.to_dict() for e in self.lifecycle_events],
|
| 506 |
"entanglements": self.get_entanglements(),
|
| 507 |
"luca_trace": self.get_luca_trace(),
|
| 508 |
}
|
|
@@ -515,8 +499,7 @@ class STAT7Entity(ABC):
|
|
| 515 |
"stat7_dimensions": self.stat7.to_dict(),
|
| 516 |
"realm_details": self._get_realm_details(),
|
| 517 |
"entanglement_network": self.get_entanglements(),
|
| 518 |
-
"event_chronology": [
|
| 519 |
-
e.to_dict() for e in self.lifecycle_events],
|
| 520 |
}
|
| 521 |
|
| 522 |
def _get_realm_details(self) -> Dict[str, Any]:
|
|
|
|
| 251 |
# Event Tracking
|
| 252 |
# ========================================================================
|
| 253 |
|
| 254 |
+
def _record_event(self, event_type: str, description: str, metadata: Dict[str, Any] = None):
|
|
|
|
| 255 |
"""Record a lifecycle event"""
|
| 256 |
event = LifecycleEvent(
|
| 257 |
timestamp=datetime.utcnow(),
|
|
|
|
| 262 |
self.lifecycle_events.append(event)
|
| 263 |
self.last_activity = event.timestamp
|
| 264 |
|
| 265 |
+
def get_event_history(self, limit: Optional[int] = None) -> List[LifecycleEvent]:
|
|
|
|
|
|
|
| 266 |
"""Get lifecycle events, optionally limited to most recent"""
|
| 267 |
+
events = sorted(self.lifecycle_events, key=lambda e: e.timestamp, reverse=True)
|
|
|
|
| 268 |
return events[:limit] if limit else events
|
| 269 |
|
| 270 |
# ========================================================================
|
|
|
|
| 280 |
self.entangled_entities.append(other_entity_id)
|
| 281 |
self.entanglement_strength.append(strength)
|
| 282 |
self._record_event(
|
| 283 |
+
"entanglement_added", f"Entangled with {other_entity_id}", {"strength": strength}
|
|
|
|
| 284 |
)
|
| 285 |
|
| 286 |
def remove_entanglement(self, other_entity_id: str):
|
|
|
|
| 289 |
idx = self.entangled_entities.index(other_entity_id)
|
| 290 |
self.entangled_entities.pop(idx)
|
| 291 |
self.entanglement_strength.pop(idx)
|
| 292 |
+
self._record_event("entanglement_removed", f"Untangled from {other_entity_id}")
|
|
|
|
| 293 |
|
| 294 |
def get_entanglements(self) -> List[Tuple[str, float]]:
|
| 295 |
"""Get all entangled entities with strength"""
|
| 296 |
return list(zip(self.entangled_entities, self.entanglement_strength))
|
| 297 |
|
| 298 |
+
def update_entanglement_strength(self, other_entity_id: str, new_strength: float):
|
|
|
|
|
|
|
|
|
|
| 299 |
"""Update entanglement strength with another entity"""
|
| 300 |
if other_entity_id in self.entangled_entities:
|
| 301 |
idx = self.entangled_entities.index(other_entity_id)
|
|
|
|
| 356 |
{"trait_type": "Realm", "value": self.stat7.realm.value},
|
| 357 |
{"trait_type": "Lineage", "value": self.stat7.lineage},
|
| 358 |
{"trait_type": "Horizon", "value": self.stat7.horizon.value},
|
| 359 |
+
{"trait_type": "Luminosity", "value": int(self.stat7.luminosity)},
|
|
|
|
| 360 |
{"trait_type": "Polarity", "value": self.stat7.polarity.value},
|
| 361 |
{"trait_type": "Dimensionality", "value": self.stat7.dimensionality},
|
| 362 |
{"trait_type": "STAT7 Address", "value": self.stat7.address},
|
|
|
|
| 364 |
"properties": card_data.get("properties", {}),
|
| 365 |
}
|
| 366 |
|
| 367 |
+
def record_mint(self, contract_address: str, token_id: int, ipfs_hash: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
"""Record successful NFT minting"""
|
| 369 |
self.nft_minted = True
|
| 370 |
self.nft_contract = contract_address
|
|
|
|
| 373 |
self._record_event(
|
| 374 |
"nft_minted",
|
| 375 |
f"Minted as ERC-721 token #{token_id}",
|
| 376 |
+
{"contract": contract_address, "token_id": token_id, "ipfs_hash": ipfs_hash},
|
|
|
|
| 377 |
)
|
| 378 |
|
| 379 |
# ========================================================================
|
|
|
|
| 486 |
"type": "entity_profile",
|
| 487 |
**card_data,
|
| 488 |
"owner": self.owner_id,
|
| 489 |
+
"lifecycle_events": [e.to_dict() for e in self.lifecycle_events],
|
|
|
|
| 490 |
"entanglements": self.get_entanglements(),
|
| 491 |
"luca_trace": self.get_luca_trace(),
|
| 492 |
}
|
|
|
|
| 499 |
"stat7_dimensions": self.stat7.to_dict(),
|
| 500 |
"realm_details": self._get_realm_details(),
|
| 501 |
"entanglement_network": self.get_entanglements(),
|
| 502 |
+
"event_chronology": [e.to_dict() for e in self.lifecycle_events],
|
|
|
|
| 503 |
}
|
| 504 |
|
| 505 |
def _get_realm_details(self) -> Dict[str, Any]:
|
warbler_cda/stat7_experiments.py
CHANGED
|
@@ -63,8 +63,7 @@ def normalize_float(value: float, decimal_places: int = 8) -> str:
|
|
| 63 |
|
| 64 |
# Use Decimal for precise rounding
|
| 65 |
d = Decimal(str(value))
|
| 66 |
-
quantized = d.quantize(Decimal(10) ** -decimal_places,
|
| 67 |
-
rounding=ROUND_HALF_EVEN)
|
| 68 |
|
| 69 |
# Convert to string and strip trailing zeros (but keep at least one
|
| 70 |
# decimal)
|
|
@@ -138,8 +137,7 @@ def canonical_serialize(data: Dict[str, Any]) -> str:
|
|
| 138 |
sorted_data = sort_json_keys(data)
|
| 139 |
|
| 140 |
# Serialize with no whitespace, ensure_ascii=False to preserve Unicode
|
| 141 |
-
canonical = json.dumps(sorted_data, separators=(
|
| 142 |
-
",", ":"), ensure_ascii=True, sort_keys=False)
|
| 143 |
|
| 144 |
return canonical
|
| 145 |
|
|
@@ -238,8 +236,7 @@ class BitChain:
|
|
| 238 |
def get_stat7_uri(self) -> str:
|
| 239 |
"""Generate STAT7 URI address format."""
|
| 240 |
coords = self.coordinates
|
| 241 |
-
adjacency_hash = compute_address_hash(
|
| 242 |
-
{"adjacency": sorted(coords.adjacency)})[:8]
|
| 243 |
|
| 244 |
uri = f"stat7://{coords.realm}/{coords.lineage}/{adjacency_hash}/{coords.horizon}"
|
| 245 |
uri += f"?r={normalize_float(coords.resonance)}"
|
|
@@ -255,8 +252,7 @@ class BitChain:
|
|
| 255 |
|
| 256 |
REALMS = ["data", "narrative", "system", "faculty", "event", "pattern", "void"]
|
| 257 |
HORIZONS = ["genesis", "emergence", "peak", "decay", "crystallization"]
|
| 258 |
-
ENTITY_TYPES = ["concept", "artifact", "agent",
|
| 259 |
-
"lineage", "adjacency", "horizon", "fragment"]
|
| 260 |
|
| 261 |
|
| 262 |
def generate_random_bitchain(seed: Optional[int] = None) -> BitChain:
|
|
@@ -342,11 +338,8 @@ class EXP01_AddressUniqueness:
|
|
| 342 |
for iteration in range(self.iterations):
|
| 343 |
# Generate random bit-chains
|
| 344 |
bitchains = [
|
| 345 |
-
generate_random_bitchain(
|
| 346 |
-
|
| 347 |
-
1000 +
|
| 348 |
-
i) for i in range(
|
| 349 |
-
self.sample_size)]
|
| 350 |
|
| 351 |
# Compute addresses
|
| 352 |
addresses = set()
|
|
@@ -387,15 +380,14 @@ class EXP01_AddressUniqueness:
|
|
| 387 |
|
| 388 |
if collision_pairs:
|
| 389 |
for addr, ids in collision_pairs.items():
|
| 390 |
-
print(
|
| 391 |
-
f" ⚠️ Collision on {addr[:16]}... : {len(ids)} entries")
|
| 392 |
|
| 393 |
print()
|
| 394 |
print(
|
| 395 |
f"OVERALL RESULT: {
|
| 396 |
-
'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
|
| 397 |
-
|
| 398 |
-
|
| 399 |
|
| 400 |
return self.results, all_success
|
| 401 |
|
|
@@ -474,8 +466,7 @@ class EXP02_RetrievalEfficiency:
|
|
| 474 |
print(f"Testing scale: {scale:,} bit-chains")
|
| 475 |
|
| 476 |
# Generate bit-chains
|
| 477 |
-
bitchains = [generate_random_bitchain(
|
| 478 |
-
seed=i) for i in range(scale)]
|
| 479 |
|
| 480 |
# Index by address for O(1) retrieval simulation
|
| 481 |
address_to_bc = {bc.compute_address(): bc for bc in bitchains}
|
|
@@ -532,7 +523,8 @@ class EXP02_RetrievalEfficiency:
|
|
| 532 |
|
| 533 |
print(
|
| 534 |
f"OVERALL RESULT: {
|
| 535 |
-
'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
|
|
|
|
| 536 |
|
| 537 |
return self.results, all_success
|
| 538 |
|
|
@@ -606,8 +598,7 @@ class EXP03_DimensionNecessity:
|
|
| 606 |
|
| 607 |
# Baseline: all 7 dimensions
|
| 608 |
print("Baseline: All 7 dimensions")
|
| 609 |
-
bitchains = [generate_random_bitchain(
|
| 610 |
-
seed=i) for i in range(self.sample_size)]
|
| 611 |
addresses = set()
|
| 612 |
collisions = 0
|
| 613 |
|
|
@@ -631,7 +622,8 @@ class EXP03_DimensionNecessity:
|
|
| 631 |
status = "✅ PASS" if result.acceptable else "❌ FAIL"
|
| 632 |
print(
|
| 633 |
f" {status} | Collisions: {collisions} | Rate: {
|
| 634 |
-
baseline_collision_rate * 100:.4f}%"
|
|
|
|
| 635 |
print()
|
| 636 |
|
| 637 |
# Ablation: remove each dimension
|
|
@@ -661,8 +653,7 @@ class EXP03_DimensionNecessity:
|
|
| 661 |
acceptable = collision_rate < 0.001 # Should be unacceptable without each dim
|
| 662 |
|
| 663 |
result = EXP03_Result(
|
| 664 |
-
dimensions_used=[
|
| 665 |
-
d for d in self.STAT7_DIMENSIONS if d != removed_dim],
|
| 666 |
sample_size=self.sample_size,
|
| 667 |
collisions=collisions,
|
| 668 |
collision_rate=collision_rate,
|
|
@@ -676,7 +667,8 @@ class EXP03_DimensionNecessity:
|
|
| 676 |
status = "✅ NECESSARY" if necessity else "⚠️ OPTIONAL"
|
| 677 |
print(
|
| 678 |
f" {status} | Collisions: {collisions} | Rate: {
|
| 679 |
-
collision_rate * 100:.4f}%"
|
|
|
|
| 680 |
|
| 681 |
print()
|
| 682 |
print(
|
|
@@ -720,8 +712,7 @@ def run_all_experiments(
|
|
| 720 |
results = {}
|
| 721 |
|
| 722 |
# EXP-01
|
| 723 |
-
exp01 = EXP01_AddressUniqueness(
|
| 724 |
-
sample_size=exp01_samples, iterations=exp01_iterations)
|
| 725 |
_, exp01_success = exp01.run()
|
| 726 |
results["EXP-01"] = {
|
| 727 |
"success": exp01_success,
|
|
@@ -760,7 +751,8 @@ def run_all_experiments(
|
|
| 760 |
print(
|
| 761 |
f"\nOverall Phase 1 Status: {
|
| 762 |
'✅ READY FOR PHASE 2' if all(
|
| 763 |
-
r['success'] for r in results.values()) else '❌ NEEDS WORK'}"
|
|
|
|
| 764 |
|
| 765 |
return results
|
| 766 |
|
|
|
|
| 63 |
|
| 64 |
# Use Decimal for precise rounding
|
| 65 |
d = Decimal(str(value))
|
| 66 |
+
quantized = d.quantize(Decimal(10) ** -decimal_places, rounding=ROUND_HALF_EVEN)
|
|
|
|
| 67 |
|
| 68 |
# Convert to string and strip trailing zeros (but keep at least one
|
| 69 |
# decimal)
|
|
|
|
| 137 |
sorted_data = sort_json_keys(data)
|
| 138 |
|
| 139 |
# Serialize with no whitespace, ensure_ascii=False to preserve Unicode
|
| 140 |
+
canonical = json.dumps(sorted_data, separators=(",", ":"), ensure_ascii=True, sort_keys=False)
|
|
|
|
| 141 |
|
| 142 |
return canonical
|
| 143 |
|
|
|
|
| 236 |
def get_stat7_uri(self) -> str:
|
| 237 |
"""Generate STAT7 URI address format."""
|
| 238 |
coords = self.coordinates
|
| 239 |
+
adjacency_hash = compute_address_hash({"adjacency": sorted(coords.adjacency)})[:8]
|
|
|
|
| 240 |
|
| 241 |
uri = f"stat7://{coords.realm}/{coords.lineage}/{adjacency_hash}/{coords.horizon}"
|
| 242 |
uri += f"?r={normalize_float(coords.resonance)}"
|
|
|
|
| 252 |
|
| 253 |
REALMS = ["data", "narrative", "system", "faculty", "event", "pattern", "void"]
|
| 254 |
HORIZONS = ["genesis", "emergence", "peak", "decay", "crystallization"]
|
| 255 |
+
ENTITY_TYPES = ["concept", "artifact", "agent", "lineage", "adjacency", "horizon", "fragment"]
|
|
|
|
| 256 |
|
| 257 |
|
| 258 |
def generate_random_bitchain(seed: Optional[int] = None) -> BitChain:
|
|
|
|
| 338 |
for iteration in range(self.iterations):
|
| 339 |
# Generate random bit-chains
|
| 340 |
bitchains = [
|
| 341 |
+
generate_random_bitchain(seed=iteration * 1000 + i) for i in range(self.sample_size)
|
| 342 |
+
]
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# Compute addresses
|
| 345 |
addresses = set()
|
|
|
|
| 380 |
|
| 381 |
if collision_pairs:
|
| 382 |
for addr, ids in collision_pairs.items():
|
| 383 |
+
print(f" ⚠️ Collision on {addr[:16]}... : {len(ids)} entries")
|
|
|
|
| 384 |
|
| 385 |
print()
|
| 386 |
print(
|
| 387 |
f"OVERALL RESULT: {
|
| 388 |
+
'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
|
| 389 |
+
)
|
| 390 |
+
print(f"Success rate: {sum(1 for r in self.results if r.success)}/{self.iterations}")
|
| 391 |
|
| 392 |
return self.results, all_success
|
| 393 |
|
|
|
|
| 466 |
print(f"Testing scale: {scale:,} bit-chains")
|
| 467 |
|
| 468 |
# Generate bit-chains
|
| 469 |
+
bitchains = [generate_random_bitchain(seed=i) for i in range(scale)]
|
|
|
|
| 470 |
|
| 471 |
# Index by address for O(1) retrieval simulation
|
| 472 |
address_to_bc = {bc.compute_address(): bc for bc in bitchains}
|
|
|
|
| 523 |
|
| 524 |
print(
|
| 525 |
f"OVERALL RESULT: {
|
| 526 |
+
'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
|
| 527 |
+
)
|
| 528 |
|
| 529 |
return self.results, all_success
|
| 530 |
|
|
|
|
| 598 |
|
| 599 |
# Baseline: all 7 dimensions
|
| 600 |
print("Baseline: All 7 dimensions")
|
| 601 |
+
bitchains = [generate_random_bitchain(seed=i) for i in range(self.sample_size)]
|
|
|
|
| 602 |
addresses = set()
|
| 603 |
collisions = 0
|
| 604 |
|
|
|
|
| 622 |
status = "✅ PASS" if result.acceptable else "❌ FAIL"
|
| 623 |
print(
|
| 624 |
f" {status} | Collisions: {collisions} | Rate: {
|
| 625 |
+
baseline_collision_rate * 100:.4f}%"
|
| 626 |
+
)
|
| 627 |
print()
|
| 628 |
|
| 629 |
# Ablation: remove each dimension
|
|
|
|
| 653 |
acceptable = collision_rate < 0.001 # Should be unacceptable without each dim
|
| 654 |
|
| 655 |
result = EXP03_Result(
|
| 656 |
+
dimensions_used=[d for d in self.STAT7_DIMENSIONS if d != removed_dim],
|
|
|
|
| 657 |
sample_size=self.sample_size,
|
| 658 |
collisions=collisions,
|
| 659 |
collision_rate=collision_rate,
|
|
|
|
| 667 |
status = "✅ NECESSARY" if necessity else "⚠️ OPTIONAL"
|
| 668 |
print(
|
| 669 |
f" {status} | Collisions: {collisions} | Rate: {
|
| 670 |
+
collision_rate * 100:.4f}%"
|
| 671 |
+
)
|
| 672 |
|
| 673 |
print()
|
| 674 |
print(
|
|
|
|
| 712 |
results = {}
|
| 713 |
|
| 714 |
# EXP-01
|
| 715 |
+
exp01 = EXP01_AddressUniqueness(sample_size=exp01_samples, iterations=exp01_iterations)
|
|
|
|
| 716 |
_, exp01_success = exp01.run()
|
| 717 |
results["EXP-01"] = {
|
| 718 |
"success": exp01_success,
|
|
|
|
| 751 |
print(
|
| 752 |
f"\nOverall Phase 1 Status: {
|
| 753 |
'✅ READY FOR PHASE 2' if all(
|
| 754 |
+
r['success'] for r in results.values()) else '❌ NEEDS WORK'}"
|
| 755 |
+
)
|
| 756 |
|
| 757 |
return results
|
| 758 |
|
warbler_cda/stat7_rag_bridge.py
CHANGED
|
@@ -55,11 +55,17 @@ class STAT7Address:
|
|
| 55 |
|
| 56 |
def __post_init__(self):
|
| 57 |
"""Validate STAT7 constraints."""
|
| 58 |
-
assert
|
|
|
|
|
|
|
| 59 |
self.adjacency}"
|
| 60 |
-
assert
|
|
|
|
|
|
|
| 61 |
self.luminosity}"
|
| 62 |
-
assert
|
|
|
|
|
|
|
| 63 |
self.polarity}"
|
| 64 |
assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
|
| 65 |
assert (
|
|
@@ -91,8 +97,7 @@ class RAGDocument:
|
|
| 91 |
|
| 92 |
def __post_init__(self):
|
| 93 |
"""Validate document structure."""
|
| 94 |
-
assert len(
|
| 95 |
-
self.embedding) > 0, f"embedding must not be empty for {self.id}"
|
| 96 |
|
| 97 |
|
| 98 |
# ============================================================================
|
|
@@ -116,9 +121,7 @@ def cosine_similarity(a: List[float], b: List[float]) -> float:
|
|
| 116 |
return dot / denom
|
| 117 |
|
| 118 |
|
| 119 |
-
def stat7_resonance(
|
| 120 |
-
query_stat7: STAT7Address,
|
| 121 |
-
doc_stat7: STAT7Address) -> float:
|
| 122 |
"""
|
| 123 |
Compute STAT7 resonance between query and document addresses.
|
| 124 |
|
|
@@ -232,8 +235,7 @@ def retrieve(
|
|
| 232 |
"""
|
| 233 |
scores = []
|
| 234 |
for doc in documents:
|
| 235 |
-
score = hybrid_score(query_embedding, doc,
|
| 236 |
-
query_stat7, weight_semantic, weight_stat7)
|
| 237 |
scores.append((doc.id, score))
|
| 238 |
|
| 239 |
# Sort by score descending, return top-k
|
|
@@ -389,14 +391,11 @@ def compare_retrieval_results(
|
|
| 389 |
overlap = len(semantic_ids & hybrid_ids)
|
| 390 |
overlap_pct = (overlap / k * 100) if k > 0 else 0.0
|
| 391 |
|
| 392 |
-
semantic_avg = sum(
|
| 393 |
-
|
| 394 |
-
hybrid_avg = sum(
|
| 395 |
-
score for _, score in hybrid_results[:k]) / k if k > 0 else 0.0
|
| 396 |
|
| 397 |
# Measure ranking distance: how far did top-k items move?
|
| 398 |
-
semantic_rank = {doc_id: idx for idx,
|
| 399 |
-
(doc_id, _) in enumerate(semantic_results[:k])}
|
| 400 |
reranking_distances = []
|
| 401 |
for idx, (doc_id, _) in enumerate(hybrid_results[:k]):
|
| 402 |
if doc_id in semantic_rank:
|
|
@@ -404,8 +403,7 @@ def compare_retrieval_results(
|
|
| 404 |
reranking_distances.append(distance)
|
| 405 |
|
| 406 |
avg_reranking_distance = (
|
| 407 |
-
sum(reranking_distances) /
|
| 408 |
-
len(reranking_distances) if reranking_distances else 0.0
|
| 409 |
)
|
| 410 |
|
| 411 |
return {
|
|
@@ -434,8 +432,7 @@ class STAT7RAGBridge:
|
|
| 434 |
dependency injection.
|
| 435 |
"""
|
| 436 |
|
| 437 |
-
def stat7_resonance(self, query_stat7: STAT7Address,
|
| 438 |
-
doc_stat7: STAT7Address) -> float:
|
| 439 |
"""
|
| 440 |
Compute STAT7 resonance between query and document addresses.
|
| 441 |
|
|
@@ -467,12 +464,7 @@ class STAT7RAGBridge:
|
|
| 467 |
|
| 468 |
Returns: [0.0, 1.0] hybrid score
|
| 469 |
"""
|
| 470 |
-
return hybrid_score(
|
| 471 |
-
query_embedding,
|
| 472 |
-
doc,
|
| 473 |
-
query_stat7,
|
| 474 |
-
weight_semantic,
|
| 475 |
-
weight_stat7)
|
| 476 |
|
| 477 |
def retrieve(
|
| 478 |
self,
|
|
@@ -496,10 +488,4 @@ class STAT7RAGBridge:
|
|
| 496 |
|
| 497 |
Returns: List of (doc_id, hybrid_score) tuples, sorted by score (descending)
|
| 498 |
"""
|
| 499 |
-
return retrieve(
|
| 500 |
-
documents,
|
| 501 |
-
query_embedding,
|
| 502 |
-
query_stat7,
|
| 503 |
-
k,
|
| 504 |
-
weight_semantic,
|
| 505 |
-
weight_stat7)
|
|
|
|
| 55 |
|
| 56 |
def __post_init__(self):
|
| 57 |
"""Validate STAT7 constraints."""
|
| 58 |
+
assert (
|
| 59 |
+
0.0 <= self.adjacency <= 1.0
|
| 60 |
+
), f"adjacency must be [0,1], got {
|
| 61 |
self.adjacency}"
|
| 62 |
+
assert (
|
| 63 |
+
0.0 <= self.luminosity <= 1.0
|
| 64 |
+
), f"luminosity must be [0,1], got {
|
| 65 |
self.luminosity}"
|
| 66 |
+
assert (
|
| 67 |
+
0.0 <= self.polarity <= 1.0
|
| 68 |
+
), f"polarity must be [0,1], got {
|
| 69 |
self.polarity}"
|
| 70 |
assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
|
| 71 |
assert (
|
|
|
|
| 97 |
|
| 98 |
def __post_init__(self):
|
| 99 |
"""Validate document structure."""
|
| 100 |
+
assert len(self.embedding) > 0, f"embedding must not be empty for {self.id}"
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
# ============================================================================
|
|
|
|
| 121 |
return dot / denom
|
| 122 |
|
| 123 |
|
| 124 |
+
def stat7_resonance(query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
|
|
|
|
|
|
|
| 125 |
"""
|
| 126 |
Compute STAT7 resonance between query and document addresses.
|
| 127 |
|
|
|
|
| 235 |
"""
|
| 236 |
scores = []
|
| 237 |
for doc in documents:
|
| 238 |
+
score = hybrid_score(query_embedding, doc, query_stat7, weight_semantic, weight_stat7)
|
|
|
|
| 239 |
scores.append((doc.id, score))
|
| 240 |
|
| 241 |
# Sort by score descending, return top-k
|
|
|
|
| 391 |
overlap = len(semantic_ids & hybrid_ids)
|
| 392 |
overlap_pct = (overlap / k * 100) if k > 0 else 0.0
|
| 393 |
|
| 394 |
+
semantic_avg = sum(score for _, score in semantic_results[:k]) / k if k > 0 else 0.0
|
| 395 |
+
hybrid_avg = sum(score for _, score in hybrid_results[:k]) / k if k > 0 else 0.0
|
|
|
|
|
|
|
| 396 |
|
| 397 |
# Measure ranking distance: how far did top-k items move?
|
| 398 |
+
semantic_rank = {doc_id: idx for idx, (doc_id, _) in enumerate(semantic_results[:k])}
|
|
|
|
| 399 |
reranking_distances = []
|
| 400 |
for idx, (doc_id, _) in enumerate(hybrid_results[:k]):
|
| 401 |
if doc_id in semantic_rank:
|
|
|
|
| 403 |
reranking_distances.append(distance)
|
| 404 |
|
| 405 |
avg_reranking_distance = (
|
| 406 |
+
sum(reranking_distances) / len(reranking_distances) if reranking_distances else 0.0
|
|
|
|
| 407 |
)
|
| 408 |
|
| 409 |
return {
|
|
|
|
| 432 |
dependency injection.
|
| 433 |
"""
|
| 434 |
|
| 435 |
+
def stat7_resonance(self, query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
|
|
|
|
| 436 |
"""
|
| 437 |
Compute STAT7 resonance between query and document addresses.
|
| 438 |
|
|
|
|
| 464 |
|
| 465 |
Returns: [0.0, 1.0] hybrid score
|
| 466 |
"""
|
| 467 |
+
return hybrid_score(query_embedding, doc, query_stat7, weight_semantic, weight_stat7)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
def retrieve(
|
| 470 |
self,
|
|
|
|
| 488 |
|
| 489 |
Returns: List of (doc_id, hybrid_score) tuples, sorted by score (descending)
|
| 490 |
"""
|
| 491 |
+
return retrieve(documents, query_embedding, query_stat7, k, weight_semantic, weight_stat7)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
warbler_cda/utils/load_warbler_packs.py
CHANGED
|
@@ -52,26 +52,19 @@ class WarblerPackLoader:
|
|
| 52 |
return []
|
| 53 |
|
| 54 |
# Look for JSON, YAML, markdown, and JSONL files
|
| 55 |
-
for pattern in [
|
| 56 |
-
"**/*.json",
|
| 57 |
-
"**/*.yaml",
|
| 58 |
-
"**/*.yml",
|
| 59 |
-
"**/*.md",
|
| 60 |
-
"**/*.jsonl"]:
|
| 61 |
for file_path in pack_path.glob(pattern):
|
| 62 |
try:
|
| 63 |
doc = self._parse_document(file_path, pack_name)
|
| 64 |
if doc:
|
| 65 |
documents.append(doc)
|
| 66 |
-
logger.info(
|
| 67 |
-
f"Discovered: {file_path.relative_to(PACKS_DIR)}")
|
| 68 |
except Exception as e:
|
| 69 |
logger.error(f"Error parsing {file_path}: {e}")
|
| 70 |
|
| 71 |
return documents
|
| 72 |
|
| 73 |
-
def _parse_document(self, file_path: Path,
|
| 74 |
-
pack_name: str) -> Dict[str, Any]:
|
| 75 |
"""Parse a document file"""
|
| 76 |
try:
|
| 77 |
if file_path.suffix in [".json"]:
|
|
@@ -86,8 +79,7 @@ class WarblerPackLoader:
|
|
| 86 |
# We'll read the first few lines and combine them
|
| 87 |
with open(file_path, "r", encoding="utf-8") as f:
|
| 88 |
lines = f.readlines()[:5] # First 5 lines
|
| 89 |
-
content = "\n".join(line.strip()
|
| 90 |
-
for line in lines if line.strip())
|
| 91 |
elif file_path.suffix in [".yaml", ".yml"]:
|
| 92 |
import yaml
|
| 93 |
|
|
@@ -141,8 +133,7 @@ class WarblerPackLoader:
|
|
| 141 |
logger.info(f"[OK] Loaded: {doc['content_id']}")
|
| 142 |
return True
|
| 143 |
else:
|
| 144 |
-
logger.warning(
|
| 145 |
-
f"API returned {response.status_code}: {response.text[:200]}")
|
| 146 |
return False
|
| 147 |
except requests.exceptions.ConnectionError:
|
| 148 |
logger.error("Cannot connect to API. Is the service running?")
|
|
@@ -174,7 +165,8 @@ class WarblerPackLoader:
|
|
| 174 |
click.secho(
|
| 175 |
f"[OK] Load Complete: {
|
| 176 |
self.loaded_count} docs ingested",
|
| 177 |
-
fg="green"
|
|
|
|
| 178 |
if self.error_count > 0:
|
| 179 |
click.secho(f"[ERROR] Errors: {self.error_count}", fg="yellow")
|
| 180 |
click.echo("=" * 60 + "\n")
|
|
@@ -189,9 +181,7 @@ def cli():
|
|
| 189 |
|
| 190 |
|
| 191 |
@cli.command()
|
| 192 |
-
@click.option("--api-url",
|
| 193 |
-
default="http://localhost:8000",
|
| 194 |
-
help="API service URL")
|
| 195 |
def load(api_url):
|
| 196 |
"""Load all Warbler packs into the API"""
|
| 197 |
loader = WarblerPackLoader(api_url)
|
|
@@ -202,8 +192,7 @@ def load(api_url):
|
|
| 202 |
if response.status_code == 200:
|
| 203 |
click.secho("[OK] API service is running", fg="green")
|
| 204 |
else:
|
| 205 |
-
click.secho(
|
| 206 |
-
"[ERROR] API service not responding correctly", fg="red")
|
| 207 |
return
|
| 208 |
except Exception as e:
|
| 209 |
click.secho(f"[ERROR] Cannot reach API at {api_url}: {e}", fg="red")
|
|
@@ -225,9 +214,7 @@ def load(api_url):
|
|
| 225 |
|
| 226 |
|
| 227 |
@cli.command()
|
| 228 |
-
@click.option("--api-url",
|
| 229 |
-
default="http://localhost:8000",
|
| 230 |
-
help="API service URL")
|
| 231 |
def discover(api_url):
|
| 232 |
"""Discover documents in Warbler packs (no loading)"""
|
| 233 |
loader = WarblerPackLoader(api_url)
|
|
@@ -251,7 +238,8 @@ def discover(api_url):
|
|
| 251 |
f" Realm: {
|
| 252 |
doc['metadata'].get(
|
| 253 |
'realm_type',
|
| 254 |
-
'unknown')}"
|
|
|
|
| 255 |
|
| 256 |
click.echo(f"\n[STATS] Total discovered: {total} documents\n")
|
| 257 |
|
|
|
|
| 52 |
return []
|
| 53 |
|
| 54 |
# Look for JSON, YAML, markdown, and JSONL files
|
| 55 |
+
for pattern in ["**/*.json", "**/*.yaml", "**/*.yml", "**/*.md", "**/*.jsonl"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
for file_path in pack_path.glob(pattern):
|
| 57 |
try:
|
| 58 |
doc = self._parse_document(file_path, pack_name)
|
| 59 |
if doc:
|
| 60 |
documents.append(doc)
|
| 61 |
+
logger.info(f"Discovered: {file_path.relative_to(PACKS_DIR)}")
|
|
|
|
| 62 |
except Exception as e:
|
| 63 |
logger.error(f"Error parsing {file_path}: {e}")
|
| 64 |
|
| 65 |
return documents
|
| 66 |
|
| 67 |
+
def _parse_document(self, file_path: Path, pack_name: str) -> Dict[str, Any]:
|
|
|
|
| 68 |
"""Parse a document file"""
|
| 69 |
try:
|
| 70 |
if file_path.suffix in [".json"]:
|
|
|
|
| 79 |
# We'll read the first few lines and combine them
|
| 80 |
with open(file_path, "r", encoding="utf-8") as f:
|
| 81 |
lines = f.readlines()[:5] # First 5 lines
|
| 82 |
+
content = "\n".join(line.strip() for line in lines if line.strip())
|
|
|
|
| 83 |
elif file_path.suffix in [".yaml", ".yml"]:
|
| 84 |
import yaml
|
| 85 |
|
|
|
|
| 133 |
logger.info(f"[OK] Loaded: {doc['content_id']}")
|
| 134 |
return True
|
| 135 |
else:
|
| 136 |
+
logger.warning(f"API returned {response.status_code}: {response.text[:200]}")
|
|
|
|
| 137 |
return False
|
| 138 |
except requests.exceptions.ConnectionError:
|
| 139 |
logger.error("Cannot connect to API. Is the service running?")
|
|
|
|
| 165 |
click.secho(
|
| 166 |
f"[OK] Load Complete: {
|
| 167 |
self.loaded_count} docs ingested",
|
| 168 |
+
fg="green",
|
| 169 |
+
)
|
| 170 |
if self.error_count > 0:
|
| 171 |
click.secho(f"[ERROR] Errors: {self.error_count}", fg="yellow")
|
| 172 |
click.echo("=" * 60 + "\n")
|
|
|
|
| 181 |
|
| 182 |
|
| 183 |
@cli.command()
|
| 184 |
+
@click.option("--api-url", default="http://localhost:8000", help="API service URL")
|
|
|
|
|
|
|
| 185 |
def load(api_url):
|
| 186 |
"""Load all Warbler packs into the API"""
|
| 187 |
loader = WarblerPackLoader(api_url)
|
|
|
|
| 192 |
if response.status_code == 200:
|
| 193 |
click.secho("[OK] API service is running", fg="green")
|
| 194 |
else:
|
| 195 |
+
click.secho("[ERROR] API service not responding correctly", fg="red")
|
|
|
|
| 196 |
return
|
| 197 |
except Exception as e:
|
| 198 |
click.secho(f"[ERROR] Cannot reach API at {api_url}: {e}", fg="red")
|
|
|
|
| 214 |
|
| 215 |
|
| 216 |
@cli.command()
|
| 217 |
+
@click.option("--api-url", default="http://localhost:8000", help="API service URL")
|
|
|
|
|
|
|
| 218 |
def discover(api_url):
|
| 219 |
"""Discover documents in Warbler packs (no loading)"""
|
| 220 |
loader = WarblerPackLoader(api_url)
|
|
|
|
| 238 |
f" Realm: {
|
| 239 |
doc['metadata'].get(
|
| 240 |
'realm_type',
|
| 241 |
+
'unknown')}"
|
| 242 |
+
)
|
| 243 |
|
| 244 |
click.echo(f"\n[STATS] Total discovered: {total} documents\n")
|
| 245 |
|