jmeyer1980 committed
Commit 2133289 · 1 Parent(s): 9380cea

Fix syntax errors in app.py preventing Gradio app startup on HF Spaces

- Fix unterminated f-string literals that were split across multiple lines
- Consolidate f-strings onto single lines to resolve the SyntaxError (see the sketch below)
- This restores the interactive UI for the Warbler CDA demo
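
For illustration only, here is a minimal sketch of the failure mode described above (the identifiers are hypothetical; the exact original lines may have differed):

    # Broken: the f-string literal itself spans two physical lines, so Python
    # aborts at import time with "SyntaxError: unterminated string literal".
    #
    #     print(f"[INFO] Ingesting {num_docs} documents
    #           from Warbler packs...")
    #
    # Fixed: the literal is consolidated onto a single line. Wrapping the
    # enclosing call across lines is fine; only the string literal itself
    # must not break.
    num_docs = 3
    print(f"[INFO] Ingesting {num_docs} documents from Warbler packs...")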

app.py CHANGED
@@ -66,12 +66,9 @@ class PackManager:
66
  self.cache_dir = Path.home() / ".warbler_cda" / "cache"
67
  self.cache_dir.mkdir(parents=True, exist_ok=True)
68
  self.metadata_file = self.cache_dir / "pack_metadata.json"
69
- self.skip_cache = os.getenv(
70
- "WARBLER_SKIP_PACK_CACHE", "").lower() == "true"
71
- self.sample_only = os.getenv(
72
- "WARBLER_SAMPLE_ONLY", "").lower() == "true"
73
- self.ingest_packs = os.getenv(
74
- "WARBLER_INGEST_PACKS", "true").lower() == "true"
75
 
76
  def _load_metadata(self) -> Optional[Dict]:
77
  if not self.metadata_file.exists():
@@ -146,8 +143,7 @@ api = None
146
  if WARBLER_AVAILABLE:
147
  try:
148
  embedding_provider = EmbeddingProviderFactory.get_default_provider()
149
- semantic_anchors = SemanticAnchorGraph(
150
- embedding_provider=embedding_provider)
151
  stat7_bridge = STAT7RAGBridge()
152
 
153
  api = RetrievalAPI(
@@ -176,31 +172,24 @@ if WARBLER_AVAILABLE:
176
  pack_loader = PackLoader()
177
  pack_docs = pack_loader.discover_documents()
178
 
179
- if pack_docs and pack_manager.should_ingest_packs(
180
- api, len(pack_docs)):
181
- print(
182
- f"[INFO] Ingesting {len(pack_docs)} documents from Warbler packs...")
183
  for doc in pack_docs:
184
- success = api.add_document(
185
- doc["id"], doc["content"], doc["metadata"])
186
  if not success:
187
  print(f"[WARN] Failed to add document {doc['id']}")
188
  packs_loaded = len(pack_docs)
189
  pack_manager.mark_packs_ingested(1, packs_loaded)
190
- print(
191
- f"[OK] Loaded {packs_loaded} documents from Warbler packs")
192
 
193
  elif pack_docs:
194
  packs_loaded = len(pack_docs)
195
- print(
196
- f"[INFO] Using cached pack data ({packs_loaded} documents)")
197
 
198
  else:
199
- print(
200
- "[INFO] No Warbler packs found. Using sample documents instead.")
201
  for doc in SAMPLE_DOCS:
202
- api.add_document(
203
- doc["id"], doc["content"], doc["metadata"])
204
  packs_loaded = len(SAMPLE_DOCS)
205
  print(f"[OK] Loaded {packs_loaded} sample documents")
206
 
@@ -252,14 +241,14 @@ def query_warbler(
252
  weight_stat7=weight_stat7,
253
  )
254
 
255
- print(
256
- f"DEBUG: Query created - ID: {query.query_id}, Text: {query_text}")
257
 
258
  # Execute query
259
  assembly = api.retrieve_context(query)
260
 
261
  print(
262
- f"DEBUG: Retrieved {len(assembly.results)} results, Assembly ID: {assembly.assembly_id}")
 
263
 
264
  elapsed_ms = (time.time() - start_time) * 1000
265
 
@@ -323,11 +312,7 @@ def query_warbler(
323
  return f"Error: {str(e)}", json.dumps({"error": str(e)}, indent=2)
324
 
325
 
326
- def add_document(
327
- doc_id: str,
328
- content: str,
329
- realm_type: str,
330
- realm_label: str) -> str:
331
  """Add a new document to the system"""
332
  if not WARBLER_AVAILABLE or not api:
333
  return "Warbler CDA not available."
@@ -377,8 +362,7 @@ def get_system_stats() -> str:
377
  metrics['retrieval_metrics']['hybrid_queries']}\n\n"
378
 
379
  stats += "## Quality Distribution\n\n"
380
- for quality, count in metrics["retrieval_metrics"]["quality_distribution"].items(
381
- ):
382
  stats += f"- {quality.capitalize()}: {count}\n"
383
 
384
  return stats
@@ -413,19 +397,17 @@ with gr.Blocks(title="Warbler CDA - RAG System Demo", theme=gr.themes.Soft()) as
413
 
414
  with gr.Row():
415
  max_results = gr.Slider(
416
- minimum=1, maximum=10, value=5, step=1, label="Max Results")
417
- use_hybrid = gr.Checkbox(
418
- label="Enable STAT7 Hybrid Scoring", value=True)
419
 
420
  with gr.Row():
421
  weight_semantic = gr.Slider(
422
- minimum=0.0, maximum=1.0, value=0.6, step=0.1, label="Semantic Weight")
 
423
  weight_stat7 = gr.Slider(
424
- minimum=0.0,
425
- maximum=1.0,
426
- value=0.4,
427
- step=0.1,
428
- label="STAT7 Weight")
429
 
430
  query_btn = gr.Button("Search", variant="primary")
431
 
@@ -449,38 +431,34 @@ with gr.Blocks(title="Warbler CDA - RAG System Demo", theme=gr.themes.Soft()) as
449
 
450
  query_btn.click(
451
  fn=query_warbler,
452
- inputs=[query_input, max_results, use_hybrid,
453
- weight_semantic, weight_stat7],
454
  outputs=[results_output, metrics_output],
455
  )
456
 
457
  with gr.Tab("Add Document"):
458
  with gr.Row():
459
  with gr.Column():
460
- doc_id_input = gr.Textbox(
461
- label="Document ID", placeholder="unique_doc_id")
462
  content_input = gr.Textbox(
463
- label="Content",
464
- placeholder="Enter document content...",
465
- lines=5)
466
 
467
  with gr.Row():
468
  realm_type_input = gr.Dropdown(
469
- choices=["wisdom", "technical",
470
- "narrative", "pattern", "data"],
471
  value="wisdom",
472
  label="Realm Type",
473
  )
474
  realm_label_input = gr.Textbox(
475
- label="Realm Label", placeholder="e.g., philosophy, documentation")
 
476
 
477
  add_btn = gr.Button("Add Document", variant="primary")
478
  add_output = gr.Textbox(label="Status", lines=3)
479
 
480
  add_btn.click(
481
  fn=add_document,
482
- inputs=[doc_id_input, content_input,
483
- realm_type_input, realm_label_input],
484
  outputs=add_output,
485
  )
486
 
 
66
  self.cache_dir = Path.home() / ".warbler_cda" / "cache"
67
  self.cache_dir.mkdir(parents=True, exist_ok=True)
68
  self.metadata_file = self.cache_dir / "pack_metadata.json"
69
+ self.skip_cache = os.getenv("WARBLER_SKIP_PACK_CACHE", "").lower() == "true"
70
+ self.sample_only = os.getenv("WARBLER_SAMPLE_ONLY", "").lower() == "true"
71
+ self.ingest_packs = os.getenv("WARBLER_INGEST_PACKS", "true").lower() == "true"
 
 
 
72
 
73
  def _load_metadata(self) -> Optional[Dict]:
74
  if not self.metadata_file.exists():
 
143
  if WARBLER_AVAILABLE:
144
  try:
145
  embedding_provider = EmbeddingProviderFactory.get_default_provider()
146
+ semantic_anchors = SemanticAnchorGraph(embedding_provider=embedding_provider)
 
147
  stat7_bridge = STAT7RAGBridge()
148
 
149
  api = RetrievalAPI(
 
172
  pack_loader = PackLoader()
173
  pack_docs = pack_loader.discover_documents()
174
 
175
+ if pack_docs and pack_manager.should_ingest_packs(api, len(pack_docs)):
176
+ print(f"[INFO] Ingesting {len(pack_docs)} documents from Warbler packs...")
 
 
177
  for doc in pack_docs:
178
+ success = api.add_document(doc["id"], doc["content"], doc["metadata"])
 
179
  if not success:
180
  print(f"[WARN] Failed to add document {doc['id']}")
181
  packs_loaded = len(pack_docs)
182
  pack_manager.mark_packs_ingested(1, packs_loaded)
183
+ print(f"[OK] Loaded {packs_loaded} documents from Warbler packs")
 
184
 
185
  elif pack_docs:
186
  packs_loaded = len(pack_docs)
187
+ print(f"[INFO] Using cached pack data ({packs_loaded} documents)")
 
188
 
189
  else:
190
+ print("[INFO] No Warbler packs found. Using sample documents instead.")
 
191
  for doc in SAMPLE_DOCS:
192
+ api.add_document(doc["id"], doc["content"], doc["metadata"])
 
193
  packs_loaded = len(SAMPLE_DOCS)
194
  print(f"[OK] Loaded {packs_loaded} sample documents")
195
 
 
241
  weight_stat7=weight_stat7,
242
  )
243
 
244
+ print(f"DEBUG: Query created - ID: {query.query_id}, Text: {query_text}")
 
245
 
246
  # Execute query
247
  assembly = api.retrieve_context(query)
248
 
249
  print(
250
+ f"DEBUG: Retrieved {len(assembly.results)} results, Assembly ID: {assembly.assembly_id}"
251
+ )
252
 
253
  elapsed_ms = (time.time() - start_time) * 1000
254
 
 
312
  return f"Error: {str(e)}", json.dumps({"error": str(e)}, indent=2)
313
 
314
 
315
+ def add_document(doc_id: str, content: str, realm_type: str, realm_label: str) -> str:
 
 
 
 
316
  """Add a new document to the system"""
317
  if not WARBLER_AVAILABLE or not api:
318
  return "Warbler CDA not available."
 
362
  metrics['retrieval_metrics']['hybrid_queries']}\n\n"
363
 
364
  stats += "## Quality Distribution\n\n"
365
+ for quality, count in metrics["retrieval_metrics"]["quality_distribution"].items():
 
366
  stats += f"- {quality.capitalize()}: {count}\n"
367
 
368
  return stats
 
397
 
398
  with gr.Row():
399
  max_results = gr.Slider(
400
+ minimum=1, maximum=10, value=5, step=1, label="Max Results"
401
+ )
402
+ use_hybrid = gr.Checkbox(label="Enable STAT7 Hybrid Scoring", value=True)
403
 
404
  with gr.Row():
405
  weight_semantic = gr.Slider(
406
+ minimum=0.0, maximum=1.0, value=0.6, step=0.1, label="Semantic Weight"
407
+ )
408
  weight_stat7 = gr.Slider(
409
+ minimum=0.0, maximum=1.0, value=0.4, step=0.1, label="STAT7 Weight"
410
+ )
 
 
 
411
 
412
  query_btn = gr.Button("Search", variant="primary")
413
 
 
431
 
432
  query_btn.click(
433
  fn=query_warbler,
434
+ inputs=[query_input, max_results, use_hybrid, weight_semantic, weight_stat7],
 
435
  outputs=[results_output, metrics_output],
436
  )
437
 
438
  with gr.Tab("Add Document"):
439
  with gr.Row():
440
  with gr.Column():
441
+ doc_id_input = gr.Textbox(label="Document ID", placeholder="unique_doc_id")
 
442
  content_input = gr.Textbox(
443
+ label="Content", placeholder="Enter document content...", lines=5
444
+ )
 
445
 
446
  with gr.Row():
447
  realm_type_input = gr.Dropdown(
448
+ choices=["wisdom", "technical", "narrative", "pattern", "data"],
 
449
  value="wisdom",
450
  label="Realm Type",
451
  )
452
  realm_label_input = gr.Textbox(
453
+ label="Realm Label", placeholder="e.g., philosophy, documentation"
454
+ )
455
 
456
  add_btn = gr.Button("Add Document", variant="primary")
457
  add_output = gr.Textbox(label="Status", lines=3)
458
 
459
  add_btn.click(
460
  fn=add_document,
461
+ inputs=[doc_id_input, content_input, realm_type_input, realm_label_input],
 
462
  outputs=add_output,
463
  )
464
 
convert_to_jsonl.py CHANGED
@@ -1,31 +1,33 @@
1
  import json
2
  import os
3
 
 
4
  def convert_templates_to_jsonl(pack_dir):
5
  """Convert templates.json to pack_name.jsonl for a given pack directory."""
6
  pack_name = os.path.basename(pack_dir)
7
- templates_path = os.path.join(pack_dir, 'pack', 'templates.json')
8
- jsonl_path = os.path.join(pack_dir, f'{pack_name}.jsonl')
9
-
10
  if not os.path.exists(templates_path):
11
  print(f"No templates.json found in {pack_dir}")
12
  return
13
-
14
- with open(templates_path, 'r') as f:
15
  templates = json.load(f)
16
-
17
- with open(jsonl_path, 'w') as f:
18
  for template in templates:
19
  json.dump(template, f)
20
- f.write('\n')
21
-
22
  print(f"Converted {templates_path} to {jsonl_path}")
23
 
 
24
  # Convert the three default packs
25
  packs_to_convert = [
26
- 'packs/warbler-pack-core',
27
- 'packs/warbler-pack-faction-politics',
28
- 'packs/warbler-pack-wisdom-scrolls'
29
  ]
30
 
31
  for pack in packs_to_convert:
 
1
  import json
2
  import os
3
 
4
+
5
  def convert_templates_to_jsonl(pack_dir):
6
  """Convert templates.json to pack_name.jsonl for a given pack directory."""
7
  pack_name = os.path.basename(pack_dir)
8
+ templates_path = os.path.join(pack_dir, "pack", "templates.json")
9
+ jsonl_path = os.path.join(pack_dir, f"{pack_name}.jsonl")
10
+
11
  if not os.path.exists(templates_path):
12
  print(f"No templates.json found in {pack_dir}")
13
  return
14
+
15
+ with open(templates_path, "r") as f:
16
  templates = json.load(f)
17
+
18
+ with open(jsonl_path, "w") as f:
19
  for template in templates:
20
  json.dump(template, f)
21
+ f.write("\n")
22
+
23
  print(f"Converted {templates_path} to {jsonl_path}")
24
 
25
+
26
  # Convert the three default packs
27
  packs_to_convert = [
28
+ "packs/warbler-pack-core",
29
+ "packs/warbler-pack-faction-politics",
30
+ "packs/warbler-pack-wisdom-scrolls",
31
  ]
32
 
33
  for pack in packs_to_convert:
test_embedding_integration.py CHANGED
@@ -26,7 +26,9 @@ def test_embedding_provider_factory():
26
  def test_sentence_transformer_provider():
27
  """Test SentenceTransformer provider directly"""
28
  try:
29
- from warbler_cda.embeddings.sentence_transformer_provider import SentenceTransformerEmbeddingProvider
 
 
30
 
31
  provider = SentenceTransformerEmbeddingProvider()
32
 
@@ -39,7 +41,7 @@ def test_sentence_transformer_provider():
39
  print(f" - Embedding dimension: {len(embedding)}")
40
  print(f" - Embedding shape: {provider.get_dimension()}")
41
 
42
- if hasattr(provider, 'compute_stat7_from_embedding'):
43
  stat7_coords = provider.compute_stat7_from_embedding(embedding)
44
  print(f" - STAT7 coordinates computed: {list(stat7_coords.keys())}")
45
 
@@ -47,6 +49,7 @@ def test_sentence_transformer_provider():
47
  except Exception as e:
48
  print(f"✗ Failed to create SentenceTransformer provider: {e}")
49
  import traceback
 
50
  traceback.print_exc()
51
  return False
52
 
@@ -59,20 +62,19 @@ def test_retrieval_api_with_embeddings():
59
 
60
  embedding_provider = EmbeddingProviderFactory.get_default_provider()
61
  api = RetrievalAPI(
62
- embedding_provider=embedding_provider,
63
- config={"enable_stat7_hybrid": True}
64
  )
65
 
66
  doc1 = {
67
  "id": "doc1",
68
  "content": "The quick brown fox jumps over the lazy dog.",
69
- "metadata": {"type": "test"}
70
  }
71
 
72
  doc2 = {
73
- "id": "doc2",
74
  "content": "Semantic embeddings enable efficient document retrieval.",
75
- "metadata": {"type": "test"}
76
  }
77
 
78
  api.add_document(doc1["id"], doc1["content"], doc1["metadata"])
@@ -86,7 +88,7 @@ def test_retrieval_api_with_embeddings():
86
  mode=RetrievalMode.SEMANTIC_SIMILARITY,
87
  semantic_query="fast animal jumps",
88
  max_results=5,
89
- confidence_threshold=0.3
90
  )
91
 
92
  assembly = api.retrieve_context(query)
@@ -98,6 +100,7 @@ def test_retrieval_api_with_embeddings():
98
  except Exception as e:
99
  print(f"✗ Failed RetrievalAPI test: {e}")
100
  import traceback
 
101
  traceback.print_exc()
102
  return False
103
 
@@ -105,7 +108,9 @@ def test_retrieval_api_with_embeddings():
105
  def test_embedding_cache():
106
  """Test embedding cache functionality"""
107
  try:
108
- from warbler_cda.embeddings.sentence_transformer_provider import SentenceTransformerEmbeddingProvider
 
 
109
 
110
  provider = SentenceTransformerEmbeddingProvider()
111
 
@@ -141,12 +146,10 @@ def main():
141
  results.append(("Factory", test_embedding_provider_factory()))
142
 
143
  print("\n2. Testing SentenceTransformer Provider...")
144
- results.append(("SentenceTransformer",
145
- test_sentence_transformer_provider()))
146
 
147
  print("\n3. Testing RetrievalAPI Integration...")
148
- results.append(("RetrievalAPI",
149
- test_retrieval_api_with_embeddings()))
150
 
151
  print("\n4. Testing Embedding Cache...")
152
  results.append(("Cache", test_embedding_cache()))
 
26
  def test_sentence_transformer_provider():
27
  """Test SentenceTransformer provider directly"""
28
  try:
29
+ from warbler_cda.embeddings.sentence_transformer_provider import (
30
+ SentenceTransformerEmbeddingProvider,
31
+ )
32
 
33
  provider = SentenceTransformerEmbeddingProvider()
34
 
 
41
  print(f" - Embedding dimension: {len(embedding)}")
42
  print(f" - Embedding shape: {provider.get_dimension()}")
43
 
44
+ if hasattr(provider, "compute_stat7_from_embedding"):
45
  stat7_coords = provider.compute_stat7_from_embedding(embedding)
46
  print(f" - STAT7 coordinates computed: {list(stat7_coords.keys())}")
47
 
 
49
  except Exception as e:
50
  print(f"✗ Failed to create SentenceTransformer provider: {e}")
51
  import traceback
52
+
53
  traceback.print_exc()
54
  return False
55
 
 
62
 
63
  embedding_provider = EmbeddingProviderFactory.get_default_provider()
64
  api = RetrievalAPI(
65
+ embedding_provider=embedding_provider, config={"enable_stat7_hybrid": True}
 
66
  )
67
 
68
  doc1 = {
69
  "id": "doc1",
70
  "content": "The quick brown fox jumps over the lazy dog.",
71
+ "metadata": {"type": "test"},
72
  }
73
 
74
  doc2 = {
75
+ "id": "doc2",
76
  "content": "Semantic embeddings enable efficient document retrieval.",
77
+ "metadata": {"type": "test"},
78
  }
79
 
80
  api.add_document(doc1["id"], doc1["content"], doc1["metadata"])
 
88
  mode=RetrievalMode.SEMANTIC_SIMILARITY,
89
  semantic_query="fast animal jumps",
90
  max_results=5,
91
+ confidence_threshold=0.3,
92
  )
93
 
94
  assembly = api.retrieve_context(query)
 
100
  except Exception as e:
101
  print(f"✗ Failed RetrievalAPI test: {e}")
102
  import traceback
103
+
104
  traceback.print_exc()
105
  return False
106
 
 
108
  def test_embedding_cache():
109
  """Test embedding cache functionality"""
110
  try:
111
+ from warbler_cda.embeddings.sentence_transformer_provider import (
112
+ SentenceTransformerEmbeddingProvider,
113
+ )
114
 
115
  provider = SentenceTransformerEmbeddingProvider()
116
 
 
146
  results.append(("Factory", test_embedding_provider_factory()))
147
 
148
  print("\n2. Testing SentenceTransformer Provider...")
149
+ results.append(("SentenceTransformer", test_sentence_transformer_provider()))
 
150
 
151
  print("\n3. Testing RetrievalAPI Integration...")
152
+ results.append(("RetrievalAPI", test_retrieval_api_with_embeddings()))
 
153
 
154
  print("\n4. Testing Embedding Cache...")
155
  results.append(("Cache", test_embedding_cache()))
warbler_cda/api/cli.py CHANGED
@@ -38,25 +38,20 @@ class APIClient:
38
 
39
  def single_query(self, query_data: Dict[str, Any]) -> Dict[str, Any]:
40
  """Execute single query"""
41
- response = self.session.post(
42
- f"{self.base_url}/query", json=query_data, timeout=30)
43
  response.raise_for_status()
44
  return response.json()
45
 
46
- def bulk_query(self,
47
- queries: List[Dict[str,
48
- Any]],
49
- concurrency: int = 5,
50
- include_narrative: bool = False) -> Dict[str,
51
- Any]:
52
  """Execute bulk concurrent queries"""
53
  payload = {
54
  "queries": queries,
55
  "concurrency_level": concurrency,
56
  "include_narrative_analysis": include_narrative,
57
  }
58
- response = self.session.post(
59
- f"{self.base_url}/bulk_query", json=payload, timeout=120)
60
  response.raise_for_status()
61
  return response.json()
62
 
@@ -68,16 +63,13 @@ class APIClient:
68
 
69
  def reset_metrics(self) -> Dict[str, Any]:
70
  """Reset service metrics"""
71
- response = self.session.post(
72
- f"{self.base_url}/metrics/reset", timeout=5)
73
  response.raise_for_status()
74
  return response.json()
75
 
76
 
77
  @click.group()
78
- @click.option("--api-url",
79
- default="http://localhost:8000",
80
- help="API service URL")
81
  @click.pass_context
82
  def cli(ctx, api_url):
83
  """EXP-09 CLI - STAT7 Retrieval API Command Line Interface"""
@@ -106,12 +98,14 @@ def health(ctx):
106
  f" Concurrent Queries: {
107
  health_data.get(
108
  'concurrent_queries',
109
- 0)}")
 
110
  click.echo(
111
  f" Max Concurrent Observed: {
112
  health_data.get(
113
  'max_concurrent_observed',
114
- 0)}")
 
115
  click.echo(f" Hybrid Queries: {health_data.get('hybrid_queries', 0)}")
116
  click.echo(f" Errors: {health_data.get('errors', 0)}")
117
  else:
@@ -127,10 +121,8 @@ def health(ctx):
127
  @click.option("--hybrid", is_flag=True, help="Enable STAT7 hybrid scoring")
128
  @click.option("--max-results", default=10, help="Maximum results to return")
129
  @click.option("--confidence", default=0.6, help="Confidence threshold")
130
- @click.option("--weight-semantic", default=0.6,
131
- help="Semantic weight in hybrid mode")
132
- @click.option("--weight-stat7", default=0.4,
133
- help="STAT7 weight in hybrid mode")
134
  @click.option("--json-output", is_flag=True, help="Output as JSON")
135
  @click.pass_context
136
  def query(
@@ -173,30 +165,26 @@ def query(
173
  click.echo(f"Query: {result.get('query_id')}")
174
  click.echo("=" * 60)
175
  click.echo(f"Results: {result.get('result_count')}")
176
- click.echo(
177
- f"Execution Time: {result.get('execution_time_ms'):.1f}ms")
178
 
179
  if result.get("semantic_similarity"):
180
  click.echo(
181
  f"Semantic Similarity: {
182
- result.get('semantic_similarity'):.3f}")
 
183
  if result.get("stat7_resonance"):
184
- click.echo(
185
- f"STAT7 Resonance: {result.get('stat7_resonance'):.3f}")
186
 
187
  # Show narrative analysis
188
  if result.get("narrative_analysis"):
189
  narr = result["narrative_analysis"]
190
  click.echo(f"\nNarrative Analysis:")
191
- click.echo(
192
- f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
193
- click.echo(
194
- f" Narrative Threads: {narr.get('narrative_threads', 0)}")
195
  click.echo(f" Analysis: {narr.get('analysis')}")
196
 
197
  # Show results
198
- click.echo(
199
- f"\nTop Results ({min(3, len(result.get('results', [])))}):")
200
  for i, res in enumerate(result.get("results", [])[:3], 1):
201
  click.echo(
202
  f" {i}. Score: {
@@ -206,7 +194,8 @@ def query(
206
  res.get(
207
  'content',
208
  'N/A')[
209
- :50]}...")
 
210
 
211
  click.echo()
212
 
@@ -217,10 +206,8 @@ def query(
217
  @cli.command()
218
  @click.option("--num-queries", default=5, help="Number of concurrent queries")
219
  @click.option("--concurrency", default=5, help="Concurrency level")
220
- @click.option("--semantic", multiple=True,
221
- help="Semantic queries (can specify multiple)")
222
- @click.option("--hybrid", is_flag=True,
223
- help="Enable STAT7 hybrid for all queries")
224
  @click.option("--json-output", is_flag=True, help="Output as JSON")
225
  @click.pass_context
226
  def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
@@ -256,11 +243,11 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
256
  try:
257
  click.echo(
258
  f"\nExecuting {
259
- len(query_data)} concurrent queries (concurrency={concurrency})...")
 
260
  start_time = time.time()
261
 
262
- result = client.bulk_query(
263
- query_data, concurrency=concurrency, include_narrative=True)
264
 
265
  elapsed = time.time() - start_time
266
 
@@ -276,21 +263,21 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
276
  click.echo(f"Failed: {result.get('failed')} ✗")
277
  click.echo(
278
  f"Total Execution Time: {
279
- result.get('execution_time_ms'):.1f}ms")
280
- click.echo(
281
- f"Avg Query Time: {result.get('avg_query_time_ms'):.1f}ms")
282
 
283
  # Narrative analysis for entire batch
284
  if result.get("batch_narrative_analysis"):
285
  narr = result["batch_narrative_analysis"]
286
  click.echo(f"\nBatch Narrative Analysis:")
287
- click.echo(
288
- f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
289
  click.echo(
290
  f" Total Narrative Threads: {
291
  narr.get(
292
  'narrative_threads',
293
- 0)}")
 
294
  click.echo(f" Total Results: {narr.get('result_count', 0)}")
295
  click.echo(f" Analysis: {narr.get('analysis')}")
296
 
@@ -301,7 +288,8 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
301
  f" {
302
  res.get('query_id')}: {
303
  res.get('result_count')} results in {
304
- res.get('execution_time_ms'):.1f}ms")
 
305
 
306
  click.echo()
307
 
@@ -329,7 +317,8 @@ def metrics(ctx, json_output):
329
  click.echo(f"Total Queries: {metrics_data.get('total_queries')}")
330
  click.echo(
331
  f"Concurrent Queries: {
332
- metrics_data.get('concurrent_queries')}")
 
333
  click.echo(f"Max Concurrent: {metrics_data.get('max_concurrent')}")
334
  click.echo(f"Hybrid Queries: {metrics_data.get('hybrid_queries')}")
335
  click.echo(f"Errors: {metrics_data.get('errors')}")
@@ -354,19 +343,12 @@ def reset_metrics(ctx):
354
 
355
 
356
  @cli.command()
357
- @click.option("--num-scenarios", default=3,
358
- help="Number of concurrent test scenarios")
359
- @click.option("--queries-per-scenario", default=10,
360
- help="Queries per scenario")
361
  @click.option("--use-hybrid", is_flag=True, help="Use STAT7 hybrid scoring")
362
  @click.option("--output-file", help="Save results to file")
363
  @click.pass_context
364
- def stress_test(
365
- ctx,
366
- num_scenarios,
367
- queries_per_scenario,
368
- use_hybrid,
369
- output_file):
370
  """Run EXP-10 narrative preservation stress test"""
371
  client = ctx.obj["client"]
372
 
@@ -415,8 +397,7 @@ def stress_test(
415
  )
416
 
417
  try:
418
- result = client.bulk_query(
419
- queries, concurrency=10, include_narrative=True)
420
 
421
  scenario_result = {
422
  "scenario": scenario,
@@ -424,35 +405,29 @@ def stress_test(
424
  "successful": result.get("successful"),
425
  "failed": result.get("failed"),
426
  "avg_query_time_ms": result.get("avg_query_time_ms"),
427
- "batch_coherence": result.get(
428
- "batch_narrative_analysis",
429
- {}).get(
430
- "coherence_score",
431
- 0),
432
  }
433
 
434
  results_summary["scenarios"].append(scenario_result)
435
  results_summary["total_coherence_score"] += scenario_result["batch_coherence"]
436
 
437
- click.echo(
438
- f" {result.get('successful')}/{len(queries)} queries successful")
439
- click.echo(
440
- f" Coherence: {scenario_result['batch_coherence']:.3f}")
441
 
442
  except Exception as e:
443
  click.secho(f" ✗ Scenario failed: {str(e)}", fg="red")
444
 
445
  # Summary
446
- avg_coherence = results_summary["total_coherence_score"] / \
447
- max(1, num_scenarios)
448
  results_summary["average_coherence"] = avg_coherence
449
  results_summary["end_time"] = datetime.now().isoformat()
450
 
451
  click.echo("\n" + "=" * 60)
452
  click.echo("Stress Test Summary")
453
  click.echo("=" * 60)
454
- click.echo(
455
- f"Scenarios Completed: {len(results_summary['scenarios'])}/{num_scenarios}")
456
  click.echo(f"Average Coherence Score: {avg_coherence:.3f}")
457
  click.echo(f"Result: {'PASS ✓' if avg_coherence > 0.7 else 'FAIL ✗'}")
458
  click.echo()
 
38
 
39
  def single_query(self, query_data: Dict[str, Any]) -> Dict[str, Any]:
40
  """Execute single query"""
41
+ response = self.session.post(f"{self.base_url}/query", json=query_data, timeout=30)
 
42
  response.raise_for_status()
43
  return response.json()
44
 
45
+ def bulk_query(
46
+ self, queries: List[Dict[str, Any]], concurrency: int = 5, include_narrative: bool = False
47
+ ) -> Dict[str, Any]:
 
 
 
48
  """Execute bulk concurrent queries"""
49
  payload = {
50
  "queries": queries,
51
  "concurrency_level": concurrency,
52
  "include_narrative_analysis": include_narrative,
53
  }
54
+ response = self.session.post(f"{self.base_url}/bulk_query", json=payload, timeout=120)
 
55
  response.raise_for_status()
56
  return response.json()
57
 
 
63
 
64
  def reset_metrics(self) -> Dict[str, Any]:
65
  """Reset service metrics"""
66
+ response = self.session.post(f"{self.base_url}/metrics/reset", timeout=5)
 
67
  response.raise_for_status()
68
  return response.json()
69
 
70
 
71
  @click.group()
72
+ @click.option("--api-url", default="http://localhost:8000", help="API service URL")
 
 
73
  @click.pass_context
74
  def cli(ctx, api_url):
75
  """EXP-09 CLI - STAT7 Retrieval API Command Line Interface"""
 
98
  f" Concurrent Queries: {
99
  health_data.get(
100
  'concurrent_queries',
101
+ 0)}"
102
+ )
103
  click.echo(
104
  f" Max Concurrent Observed: {
105
  health_data.get(
106
  'max_concurrent_observed',
107
+ 0)}"
108
+ )
109
  click.echo(f" Hybrid Queries: {health_data.get('hybrid_queries', 0)}")
110
  click.echo(f" Errors: {health_data.get('errors', 0)}")
111
  else:
 
121
  @click.option("--hybrid", is_flag=True, help="Enable STAT7 hybrid scoring")
122
  @click.option("--max-results", default=10, help="Maximum results to return")
123
  @click.option("--confidence", default=0.6, help="Confidence threshold")
124
+ @click.option("--weight-semantic", default=0.6, help="Semantic weight in hybrid mode")
125
+ @click.option("--weight-stat7", default=0.4, help="STAT7 weight in hybrid mode")
 
 
126
  @click.option("--json-output", is_flag=True, help="Output as JSON")
127
  @click.pass_context
128
  def query(
 
165
  click.echo(f"Query: {result.get('query_id')}")
166
  click.echo("=" * 60)
167
  click.echo(f"Results: {result.get('result_count')}")
168
+ click.echo(f"Execution Time: {result.get('execution_time_ms'):.1f}ms")
 
169
 
170
  if result.get("semantic_similarity"):
171
  click.echo(
172
  f"Semantic Similarity: {
173
+ result.get('semantic_similarity'):.3f}"
174
+ )
175
  if result.get("stat7_resonance"):
176
+ click.echo(f"STAT7 Resonance: {result.get('stat7_resonance'):.3f}")
 
177
 
178
  # Show narrative analysis
179
  if result.get("narrative_analysis"):
180
  narr = result["narrative_analysis"]
181
  click.echo(f"\nNarrative Analysis:")
182
+ click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
183
+ click.echo(f" Narrative Threads: {narr.get('narrative_threads', 0)}")
 
 
184
  click.echo(f" Analysis: {narr.get('analysis')}")
185
 
186
  # Show results
187
+ click.echo(f"\nTop Results ({min(3, len(result.get('results', [])))}):")
 
188
  for i, res in enumerate(result.get("results", [])[:3], 1):
189
  click.echo(
190
  f" {i}. Score: {
 
194
  res.get(
195
  'content',
196
  'N/A')[
197
+ :50]}..."
198
+ )
199
 
200
  click.echo()
201
 
 
206
  @cli.command()
207
  @click.option("--num-queries", default=5, help="Number of concurrent queries")
208
  @click.option("--concurrency", default=5, help="Concurrency level")
209
+ @click.option("--semantic", multiple=True, help="Semantic queries (can specify multiple)")
210
+ @click.option("--hybrid", is_flag=True, help="Enable STAT7 hybrid for all queries")
 
 
211
  @click.option("--json-output", is_flag=True, help="Output as JSON")
212
  @click.pass_context
213
  def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
 
243
  try:
244
  click.echo(
245
  f"\nExecuting {
246
+ len(query_data)} concurrent queries (concurrency={concurrency})..."
247
+ )
248
  start_time = time.time()
249
 
250
+ result = client.bulk_query(query_data, concurrency=concurrency, include_narrative=True)
 
251
 
252
  elapsed = time.time() - start_time
253
 
 
263
  click.echo(f"Failed: {result.get('failed')} ✗")
264
  click.echo(
265
  f"Total Execution Time: {
266
+ result.get('execution_time_ms'):.1f}ms"
267
+ )
268
+ click.echo(f"Avg Query Time: {result.get('avg_query_time_ms'):.1f}ms")
269
 
270
  # Narrative analysis for entire batch
271
  if result.get("batch_narrative_analysis"):
272
  narr = result["batch_narrative_analysis"]
273
  click.echo(f"\nBatch Narrative Analysis:")
274
+ click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
 
275
  click.echo(
276
  f" Total Narrative Threads: {
277
  narr.get(
278
  'narrative_threads',
279
+ 0)}"
280
+ )
281
  click.echo(f" Total Results: {narr.get('result_count', 0)}")
282
  click.echo(f" Analysis: {narr.get('analysis')}")
283
 
 
288
  f" {
289
  res.get('query_id')}: {
290
  res.get('result_count')} results in {
291
+ res.get('execution_time_ms'):.1f}ms"
292
+ )
293
 
294
  click.echo()
295
 
 
317
  click.echo(f"Total Queries: {metrics_data.get('total_queries')}")
318
  click.echo(
319
  f"Concurrent Queries: {
320
+ metrics_data.get('concurrent_queries')}"
321
+ )
322
  click.echo(f"Max Concurrent: {metrics_data.get('max_concurrent')}")
323
  click.echo(f"Hybrid Queries: {metrics_data.get('hybrid_queries')}")
324
  click.echo(f"Errors: {metrics_data.get('errors')}")
 
343
 
344
 
345
  @cli.command()
346
+ @click.option("--num-scenarios", default=3, help="Number of concurrent test scenarios")
347
+ @click.option("--queries-per-scenario", default=10, help="Queries per scenario")
 
 
348
  @click.option("--use-hybrid", is_flag=True, help="Use STAT7 hybrid scoring")
349
  @click.option("--output-file", help="Save results to file")
350
  @click.pass_context
351
+ def stress_test(ctx, num_scenarios, queries_per_scenario, use_hybrid, output_file):
 
 
 
 
 
352
  """Run EXP-10 narrative preservation stress test"""
353
  client = ctx.obj["client"]
354
 
 
397
  )
398
 
399
  try:
400
+ result = client.bulk_query(queries, concurrency=10, include_narrative=True)
 
401
 
402
  scenario_result = {
403
  "scenario": scenario,
 
405
  "successful": result.get("successful"),
406
  "failed": result.get("failed"),
407
  "avg_query_time_ms": result.get("avg_query_time_ms"),
408
+ "batch_coherence": result.get("batch_narrative_analysis", {}).get(
409
+ "coherence_score", 0
410
+ ),
 
 
411
  }
412
 
413
  results_summary["scenarios"].append(scenario_result)
414
  results_summary["total_coherence_score"] += scenario_result["batch_coherence"]
415
 
416
+ click.echo(f" ✓ {result.get('successful')}/{len(queries)} queries successful")
417
+ click.echo(f" Coherence: {scenario_result['batch_coherence']:.3f}")
 
 
418
 
419
  except Exception as e:
420
  click.secho(f" ✗ Scenario failed: {str(e)}", fg="red")
421
 
422
  # Summary
423
+ avg_coherence = results_summary["total_coherence_score"] / max(1, num_scenarios)
 
424
  results_summary["average_coherence"] = avg_coherence
425
  results_summary["end_time"] = datetime.now().isoformat()
426
 
427
  click.echo("\n" + "=" * 60)
428
  click.echo("Stress Test Summary")
429
  click.echo("=" * 60)
430
+ click.echo(f"Scenarios Completed: {len(results_summary['scenarios'])}/{num_scenarios}")
 
431
  click.echo(f"Average Coherence Score: {avg_coherence:.3f}")
432
  click.echo(f"Result: {'PASS ✓' if avg_coherence > 0.7 else 'FAIL ✗'}")
433
  click.echo()
warbler_cda/api/service.py CHANGED
@@ -47,8 +47,7 @@ class STAT7Address(BaseModel):
47
  """STAT7 coordinate specification"""
48
 
49
  realm: Dict[str, Any] = Field(
50
- default_factory=lambda: {
51
- "type": "retrieval_query", "label": "api_query"}
52
  )
53
  lineage: int = 0
54
  adjacency: str = "semantic_proximity"
@@ -128,8 +127,7 @@ def _init_api():
128
  return _api_instance
129
 
130
 
131
- def _analyze_narrative_coherence(
132
- results: List[Dict[str, Any]]) -> Dict[str, Any]:
133
  """
134
  Analyze narrative coherence across results.
135
  Used to validate that meaning/story threads survive concurrent access.
@@ -179,12 +177,9 @@ def _analyze_narrative_coherence(
179
  relevance_scores.append(result.get("relevance_score", 0.0))
180
 
181
  # Calculate coherence components
182
- avg_semantic = sum(semantic_scores) / \
183
- len(semantic_scores) if semantic_scores else 0.0
184
- avg_stat7 = sum(stat7_resonances) / \
185
- len(stat7_resonances) if stat7_resonances else 0.0
186
- avg_relevance = sum(relevance_scores) / \
187
- len(relevance_scores) if relevance_scores else 0.0
188
 
189
  # 1. RESULT QUALITY (50% weight): Average relevance of all results
190
  # This is the primary signal - if results aren't relevant, nothing else
@@ -197,8 +192,7 @@ def _analyze_narrative_coherence(
197
  semantic_variance = sum((s - avg_semantic) ** 2 for s in semantic_scores) / max(
198
  1, len(semantic_scores)
199
  )
200
- semantic_coherence = 1.0 / \
201
- (1.0 + semantic_variance) if semantic_variance < 1.0 else 0.0
202
 
203
  # 3. STAT7 ENTANGLEMENT (10% weight): Are results connected in STAT7 space?
204
  stat7_coherence = avg_stat7
@@ -231,14 +225,18 @@ def _analyze_narrative_coherence(
231
  if len(results) > 50: # Only log for bulk operations
232
  logger.info(
233
  f"Coherence analysis for {
234
- len(results)} results: " f"quality={
 
235
  quality_score:.3f}, semantic_coh={
236
  semantic_coherence:.3f} (var={
237
- semantic_variance:.4f}), " f"stat7={
 
238
  stat7_coherence:.3f}, focus={
239
- focus_coherence:.3f}, " f"threads={
 
240
  len(narrative_threads)}, final={
241
- coherence_score:.3f}")
 
242
 
243
  return {
244
  "coherence_score": coherence_score,
@@ -306,7 +304,8 @@ async def _stress_test_result(
306
  if query.stat7_hybrid and query.semantic_query:
307
  logger.info(
308
  f"Bob Test 1: Pure semantic retrieval for query {
309
- query.query_id}")
 
310
  semantic_query = RetrievalQuery(
311
  query_id=f"{query.query_id}_bob_semantic",
312
  mode=RetrievalMode.SEMANTIC_SIMILARITY,
@@ -316,8 +315,7 @@ async def _stress_test_result(
316
  )
317
  semantic_assembly = api.retrieve_context(semantic_query)
318
  semantic_ids = set(r.content_id for r in semantic_assembly.results)
319
- semantic_overlap = len(
320
- original_ids & semantic_ids) / max(1, len(original_ids))
321
 
322
  log["tests_run"].append(
323
  {
@@ -329,8 +327,7 @@ async def _stress_test_result(
329
 
330
  # Test 2: Pure STAT7 retrieval (if hybrid was used)
331
  if query.stat7_hybrid and query.stat7_address:
332
- logger.info(
333
- f"Bob Test 2: Pure STAT7 retrieval for query {query.query_id}")
334
  stat7_query = RetrievalQuery(
335
  query_id=f"{query.query_id}_bob_stat7",
336
  mode=RetrievalMode.STAT7_ADDRESS,
@@ -340,8 +337,7 @@ async def _stress_test_result(
340
  )
341
  stat7_assembly = api.retrieve_context(stat7_query)
342
  stat7_ids = set(r.content_id for r in stat7_assembly.results)
343
- stat7_overlap = len(original_ids & stat7_ids) / \
344
- max(1, len(original_ids))
345
 
346
  log["tests_run"].append(
347
  {
@@ -356,21 +352,20 @@ async def _stress_test_result(
356
  if query.confidence_threshold < 0.8:
357
  logger.info(
358
  f"Bob Test 3: Higher confidence threshold for query {
359
- query.query_id}")
 
360
  high_conf_query = RetrievalQuery(
361
  query_id=f"{query.query_id}_bob_high_conf",
362
  mode=query.mode,
363
  semantic_query=query.semantic_query,
364
  anchor_ids=query.anchor_ids,
365
  max_results=query.max_results,
366
- confidence_threshold=min(
367
- 0.85, query.confidence_threshold + 0.2),
368
  stat7_hybrid=query.stat7_hybrid,
369
  stat7_address=query.stat7_address,
370
  )
371
  high_conf_assembly = api.retrieve_context(high_conf_query)
372
- high_conf_ids = set(
373
- r.content_id for r in high_conf_assembly.results)
374
  high_conf_overlap = len(original_ids & high_conf_ids) / max(
375
  1, min(len(original_ids), len(high_conf_ids))
376
  )
@@ -390,8 +385,7 @@ async def _stress_test_result(
390
  avg_consistency = sum(consistency_scores) / len(consistency_scores)
391
  log["consistency_score"] = avg_consistency
392
 
393
- if avg_consistency >= (
394
- 1.0 - BobSkepticConfig.STRESS_TEST_DIVERGENCE_THRESHOLD):
395
  log["verdict"] = "CONSISTENT"
396
  is_consistent = True
397
  else:
@@ -405,8 +399,7 @@ async def _stress_test_result(
405
  log["stress_test_completed"] = datetime.now().isoformat()
406
 
407
  except Exception as e:
408
- logger.error(
409
- f"Error during Bob's stress test for {query.query_id}: {str(e)}")
410
  log["error"] = str(e)
411
  log["verdict"] = "ERROR_DURING_TEST"
412
  is_consistent = False # Err on side of caution
@@ -454,19 +447,24 @@ async def _bob_skeptic_filter(
454
  # Results are verified despite low entanglement
455
  logger.info(
456
  f"✅ BOB VERIFIED: Query {
457
- query.query_id} is consistent across stress tests. " f"High coherence is genuine, not an artifact. (consistency={
 
458
  verification_log.get(
459
  'consistency_score',
460
- 0.0):.3f})")
 
461
  return "VERIFIED", verification_log
462
  else:
463
  # Results diverge under stress testing = quarantine
464
  logger.warning(
465
  f"🚨 BOB QUARANTINE: Query {
466
- query.query_id} FAILED stress tests. " f"High coherence appears to be artifact or dataset bias. (consistency={
 
467
  verification_log.get(
468
  'consistency_score',
469
- 0.0):.3f}) " f"Escalating to Faculty for review.")
 
 
470
  return "QUARANTINED", verification_log
471
 
472
  # Results are normal - no investigation needed
@@ -484,8 +482,7 @@ async def startup_event():
484
  async def health_check():
485
  """Health check endpoint"""
486
  api = _init_api()
487
- uptime = (datetime.now() -
488
- datetime.fromisoformat(_metrics["start_time"])).total_seconds()
489
 
490
  return HealthResponse(
491
  status="healthy",
@@ -504,8 +501,7 @@ async def single_query(request: QueryRequest):
504
  api = _init_api()
505
  _metrics["total_queries"] += 1
506
  _metrics["concurrent_queries"] += 1
507
- _metrics["max_concurrent"] = max(
508
- _metrics["max_concurrent"], _metrics["concurrent_queries"])
509
 
510
  if request.stat7_hybrid:
511
  _metrics["hybrid_queries"] += 1
@@ -514,8 +510,9 @@ async def single_query(request: QueryRequest):
514
  start_time = time.time()
515
 
516
  # Convert request to RetrievalQuery
517
- mode = RetrievalMode[request.mode.upper().replace(
518
- "_", "").replace("SIMILARITY", "_SIMILARITY")]
 
519
 
520
  stat7_addr = None
521
  if request.stat7_address:
@@ -571,11 +568,9 @@ async def single_query(request: QueryRequest):
571
  result_count=len(results_data),
572
  results=results_data,
573
  semantic_similarity=(
574
- results_data[0].get(
575
- "semantic_similarity") if results_data else None
576
  ),
577
- stat7_resonance=results_data[0].get(
578
- "stat7_resonance") if results_data else None,
579
  execution_time_ms=execution_time,
580
  timestamp=datetime.now().isoformat(),
581
  narrative_analysis=narrative_analysis,
@@ -600,7 +595,8 @@ async def bulk_concurrent_queries(request: BulkQueryRequest):
600
  f"Executing {
601
  len(
602
  request.queries)} queries with concurrency level {
603
- request.concurrency_level}")
 
604
 
605
  results = []
606
  semaphore = asyncio.Semaphore(request.concurrency_level)
@@ -615,8 +611,7 @@ async def bulk_concurrent_queries(request: BulkQueryRequest):
615
  batch_results = await asyncio.gather(*tasks, return_exceptions=True)
616
 
617
  # Separate successful results from errors
618
- successful_results = [
619
- r for r in batch_results if not isinstance(r, Exception)]
620
 
621
  errors = [
622
  {"query_id": request.queries[i].query_id, "error": str(r)}
@@ -629,8 +624,7 @@ async def bulk_concurrent_queries(request: BulkQueryRequest):
629
  for result in successful_results:
630
  all_results_flat.extend(result.results)
631
 
632
- batch_narrative_analysis = _analyze_narrative_coherence(
633
- all_results_flat)
634
 
635
  return {
636
  "batch_id": f"batch_{int(time.time() * 1000)}",
@@ -675,21 +669,20 @@ async def ingest_documents(request: Dict[str, Any]):
675
  continue
676
 
677
  # Use the new add_document method
678
- success = api.add_document(
679
- doc_id=content_id, content=content, metadata=metadata)
680
 
681
  if success:
682
  ingested += 1
683
  logger.info(f"✓ Ingested: {content_id}")
684
  else:
685
- failed.append(
686
- {"doc_id": content_id, "error": "Document already exists"})
687
  logger.warning(f"Document already exists: {content_id}")
688
 
689
  logger.info(
690
  f"Ingested {ingested}/{
691
  len(documents)} documents (context store now has {
692
- api.get_context_store_size()} total)")
 
693
 
694
  response = {
695
  "status": "success",
 
47
  """STAT7 coordinate specification"""
48
 
49
  realm: Dict[str, Any] = Field(
50
+ default_factory=lambda: {"type": "retrieval_query", "label": "api_query"}
 
51
  )
52
  lineage: int = 0
53
  adjacency: str = "semantic_proximity"
 
127
  return _api_instance
128
 
129
 
130
+ def _analyze_narrative_coherence(results: List[Dict[str, Any]]) -> Dict[str, Any]:
 
131
  """
132
  Analyze narrative coherence across results.
133
  Used to validate that meaning/story threads survive concurrent access.
 
177
  relevance_scores.append(result.get("relevance_score", 0.0))
178
 
179
  # Calculate coherence components
180
+ avg_semantic = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0.0
181
+ avg_stat7 = sum(stat7_resonances) / len(stat7_resonances) if stat7_resonances else 0.0
182
+ avg_relevance = sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0
 
 
 
183
 
184
  # 1. RESULT QUALITY (50% weight): Average relevance of all results
185
  # This is the primary signal - if results aren't relevant, nothing else
 
192
  semantic_variance = sum((s - avg_semantic) ** 2 for s in semantic_scores) / max(
193
  1, len(semantic_scores)
194
  )
195
+ semantic_coherence = 1.0 / (1.0 + semantic_variance) if semantic_variance < 1.0 else 0.0
 
196
 
197
  # 3. STAT7 ENTANGLEMENT (10% weight): Are results connected in STAT7 space?
198
  stat7_coherence = avg_stat7
 
225
  if len(results) > 50: # Only log for bulk operations
226
  logger.info(
227
  f"Coherence analysis for {
228
+ len(results)} results: "
229
+ f"quality={
230
  quality_score:.3f}, semantic_coh={
231
  semantic_coherence:.3f} (var={
232
+ semantic_variance:.4f}), "
233
+ f"stat7={
234
  stat7_coherence:.3f}, focus={
235
+ focus_coherence:.3f}, "
236
+ f"threads={
237
  len(narrative_threads)}, final={
238
+ coherence_score:.3f}"
239
+ )
240
 
241
  return {
242
  "coherence_score": coherence_score,
 
304
  if query.stat7_hybrid and query.semantic_query:
305
  logger.info(
306
  f"Bob Test 1: Pure semantic retrieval for query {
307
+ query.query_id}"
308
+ )
309
  semantic_query = RetrievalQuery(
310
  query_id=f"{query.query_id}_bob_semantic",
311
  mode=RetrievalMode.SEMANTIC_SIMILARITY,
 
315
  )
316
  semantic_assembly = api.retrieve_context(semantic_query)
317
  semantic_ids = set(r.content_id for r in semantic_assembly.results)
318
+ semantic_overlap = len(original_ids & semantic_ids) / max(1, len(original_ids))
 
319
 
320
  log["tests_run"].append(
321
  {
 
327
 
328
  # Test 2: Pure STAT7 retrieval (if hybrid was used)
329
  if query.stat7_hybrid and query.stat7_address:
330
+ logger.info(f"Bob Test 2: Pure STAT7 retrieval for query {query.query_id}")
 
331
  stat7_query = RetrievalQuery(
332
  query_id=f"{query.query_id}_bob_stat7",
333
  mode=RetrievalMode.STAT7_ADDRESS,
 
337
  )
338
  stat7_assembly = api.retrieve_context(stat7_query)
339
  stat7_ids = set(r.content_id for r in stat7_assembly.results)
340
+ stat7_overlap = len(original_ids & stat7_ids) / max(1, len(original_ids))
 
341
 
342
  log["tests_run"].append(
343
  {
 
352
  if query.confidence_threshold < 0.8:
353
  logger.info(
354
  f"Bob Test 3: Higher confidence threshold for query {
355
+ query.query_id}"
356
+ )
357
  high_conf_query = RetrievalQuery(
358
  query_id=f"{query.query_id}_bob_high_conf",
359
  mode=query.mode,
360
  semantic_query=query.semantic_query,
361
  anchor_ids=query.anchor_ids,
362
  max_results=query.max_results,
363
+ confidence_threshold=min(0.85, query.confidence_threshold + 0.2),
 
364
  stat7_hybrid=query.stat7_hybrid,
365
  stat7_address=query.stat7_address,
366
  )
367
  high_conf_assembly = api.retrieve_context(high_conf_query)
368
+ high_conf_ids = set(r.content_id for r in high_conf_assembly.results)
 
369
  high_conf_overlap = len(original_ids & high_conf_ids) / max(
370
  1, min(len(original_ids), len(high_conf_ids))
371
  )
 
385
  avg_consistency = sum(consistency_scores) / len(consistency_scores)
386
  log["consistency_score"] = avg_consistency
387
 
388
+ if avg_consistency >= (1.0 - BobSkepticConfig.STRESS_TEST_DIVERGENCE_THRESHOLD):
 
389
  log["verdict"] = "CONSISTENT"
390
  is_consistent = True
391
  else:
 
399
  log["stress_test_completed"] = datetime.now().isoformat()
400
 
401
  except Exception as e:
402
+ logger.error(f"Error during Bob's stress test for {query.query_id}: {str(e)}")
 
403
  log["error"] = str(e)
404
  log["verdict"] = "ERROR_DURING_TEST"
405
  is_consistent = False # Err on side of caution
 
447
  # Results are verified despite low entanglement
448
  logger.info(
449
  f"✅ BOB VERIFIED: Query {
450
+ query.query_id} is consistent across stress tests. "
451
+ f"High coherence is genuine, not an artifact. (consistency={
452
  verification_log.get(
453
  'consistency_score',
454
+ 0.0):.3f})"
455
+ )
456
  return "VERIFIED", verification_log
457
  else:
458
  # Results diverge under stress testing = quarantine
459
  logger.warning(
460
  f"🚨 BOB QUARANTINE: Query {
461
+ query.query_id} FAILED stress tests. "
462
+ f"High coherence appears to be artifact or dataset bias. (consistency={
463
  verification_log.get(
464
  'consistency_score',
465
+ 0.0):.3f}) "
466
+ f"Escalating to Faculty for review."
467
+ )
468
  return "QUARANTINED", verification_log
469
 
470
  # Results are normal - no investigation needed
 
482
  async def health_check():
483
  """Health check endpoint"""
484
  api = _init_api()
485
+ uptime = (datetime.now() - datetime.fromisoformat(_metrics["start_time"])).total_seconds()
 
486
 
487
  return HealthResponse(
488
  status="healthy",
 
501
  api = _init_api()
502
  _metrics["total_queries"] += 1
503
  _metrics["concurrent_queries"] += 1
504
+ _metrics["max_concurrent"] = max(_metrics["max_concurrent"], _metrics["concurrent_queries"])
 
505
 
506
  if request.stat7_hybrid:
507
  _metrics["hybrid_queries"] += 1
 
510
  start_time = time.time()
511
 
512
  # Convert request to RetrievalQuery
513
+ mode = RetrievalMode[
514
+ request.mode.upper().replace("_", "").replace("SIMILARITY", "_SIMILARITY")
515
+ ]
516
 
517
  stat7_addr = None
518
  if request.stat7_address:
 
568
  result_count=len(results_data),
569
  results=results_data,
570
  semantic_similarity=(
571
+ results_data[0].get("semantic_similarity") if results_data else None
 
572
  ),
573
+ stat7_resonance=results_data[0].get("stat7_resonance") if results_data else None,
 
574
  execution_time_ms=execution_time,
575
  timestamp=datetime.now().isoformat(),
576
  narrative_analysis=narrative_analysis,
 
595
  f"Executing {
596
  len(
597
  request.queries)} queries with concurrency level {
598
+ request.concurrency_level}"
599
+ )
600
 
601
  results = []
602
  semaphore = asyncio.Semaphore(request.concurrency_level)
 
611
  batch_results = await asyncio.gather(*tasks, return_exceptions=True)
612
 
613
  # Separate successful results from errors
614
+ successful_results = [r for r in batch_results if not isinstance(r, Exception)]
 
615
 
616
  errors = [
617
  {"query_id": request.queries[i].query_id, "error": str(r)}
 
624
  for result in successful_results:
625
  all_results_flat.extend(result.results)
626
 
627
+ batch_narrative_analysis = _analyze_narrative_coherence(all_results_flat)
 
628
 
629
  return {
630
  "batch_id": f"batch_{int(time.time() * 1000)}",
 
669
  continue
670
 
671
  # Use the new add_document method
672
+ success = api.add_document(doc_id=content_id, content=content, metadata=metadata)
 
673
 
674
  if success:
675
  ingested += 1
676
  logger.info(f"✓ Ingested: {content_id}")
677
  else:
678
+ failed.append({"doc_id": content_id, "error": "Document already exists"})
 
679
  logger.warning(f"Document already exists: {content_id}")
680
 
681
  logger.info(
682
  f"Ingested {ingested}/{
683
  len(documents)} documents (context store now has {
684
+ api.get_context_store_size()} total)"
685
+ )
686
 
687
  response = {
688
  "status": "success",
warbler_cda/conflict_detector.py CHANGED
@@ -65,22 +65,15 @@ class ConflictDetector:
65
  - Confidence scoring and evidence collection
66
  """
67
 
68
- def __init__(self,
69
- config: Optional[Dict[str,
70
- Any]] = None,
71
- embedding_provider=None):
72
  self.config = config or {}
73
  self.embedding_provider = embedding_provider
74
 
75
  # Configuration parameters
76
- self.opposition_threshold = self.config.get(
77
- "opposition_threshold", 0.7)
78
- self.semantic_similarity_threshold = self.config.get(
79
- "semantic_similarity_threshold", 0.8)
80
- self.min_confidence_score = self.config.get(
81
- "min_confidence_score", 0.6)
82
- self.max_statement_age_hours = self.config.get(
83
- "max_statement_age_hours", 24)
84
 
85
  # Storage
86
  self.statement_fingerprints: Dict[str, StatementFingerprint] = {}
@@ -152,8 +145,7 @@ class ConflictDetector:
152
  "average_confidence": 0.0,
153
  }
154
 
155
- def process_statements(
156
- self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
157
  """
158
  Process new statements and detect conflicts with existing statements.
159
 
@@ -169,10 +161,7 @@ class ConflictDetector:
169
  "new_conflicts": [],
170
  "fingerprints_created": 0,
171
  "total_active_statements": 0,
172
- "conflict_summary": {
173
- "high_confidence": 0,
174
- "medium_confidence": 0,
175
- "low_confidence": 0},
176
  }
177
 
178
  # Process each statement
@@ -184,8 +173,7 @@ class ConflictDetector:
184
  continue
185
 
186
  # Create fingerprint for new statement
187
- fingerprint = self._create_statement_fingerprint(
188
- statement_id, content, statement)
189
  self.statement_fingerprints[statement_id] = fingerprint
190
  processing_report["fingerprints_created"] += 1
191
 
@@ -203,7 +191,8 @@ class ConflictDetector:
203
  "conflict_type": conflict.conflict_type.value,
204
  "confidence_score": conflict.confidence_score,
205
  "opposition_indicators": conflict.opposition_indicators,
206
- })
 
207
 
208
  # Categorize by confidence
209
  if conflict.confidence_score >= 0.8:
@@ -219,8 +208,7 @@ class ConflictDetector:
219
  # Update metrics
220
  elapsed_ms = (time.time() - start_time) * 1000
221
  self.metrics["statements_processed"] += len(statements)
222
- self.metrics["conflicts_detected"] += len(
223
- processing_report["new_conflicts"])
224
  self.metrics["processing_time_ms"] += elapsed_ms
225
 
226
  if self.detected_conflicts:
@@ -229,10 +217,8 @@ class ConflictDetector:
229
  ) / len(self.detected_conflicts)
230
 
231
  processing_report["elapsed_ms"] = elapsed_ms
232
- processing_report["total_active_statements"] = len(
233
- self.statement_fingerprints)
234
- processing_report["total_conflicts_detected"] = len(
235
- self.detected_conflicts)
236
 
237
  return processing_report
238
 
@@ -263,8 +249,7 @@ class ConflictDetector:
263
 
264
  for conflict in conflicts_involving_statement:
265
  conflict_type = conflict.conflict_type.value
266
- conflict_types[conflict_type] = conflict_types.get(
267
- conflict_type, 0) + 1
268
  max_confidence = max(max_confidence, conflict.confidence_score)
269
 
270
  # Add opposing statement
@@ -324,8 +309,7 @@ class ConflictDetector:
324
  for conflict in self.detected_conflicts:
325
  # Count by type
326
  conflict_type = conflict.conflict_type.value
327
- conflict_types[conflict_type] = conflict_types.get(
328
- conflict_type, 0) + 1
329
 
330
  # Count by confidence
331
  if conflict.confidence_score >= 0.8:
@@ -351,8 +335,7 @@ class ConflictDetector:
351
  status = "healthy"
352
 
353
  health_score = self._calculate_health_score()
354
- recommendations = self._generate_system_recommendations(
355
- status, conflict_types)
356
 
357
  return {
358
  "total_conflicts": len(self.detected_conflicts),
@@ -402,16 +385,19 @@ class ConflictDetector:
402
  # Detect negation indicators
403
  content_lower = content.lower()
404
  negation_indicators = [
405
- pattern for pattern in self.negation_patterns if pattern in content_lower]
 
406
 
407
  # Calculate assertion strength
408
  assertion_indicators = [
409
- pattern for pattern in self.assertion_patterns if pattern in content_lower]
 
410
  assertion_strength = min(len(assertion_indicators) * 0.2, 1.0)
411
 
412
  # Extract temporal markers
413
  temporal_markers = [
414
- pattern for pattern in self.temporal_patterns if pattern in content_lower]
 
415
 
416
  # Extract domain tags (simple keyword-based)
417
  domain_tags = set()
@@ -467,8 +453,7 @@ class ConflictDetector:
467
  context_overlap = len(
468
  new_fingerprint.domain_tags & existing_fingerprint.domain_tags
469
  ) / max(
470
- len(new_fingerprint.domain_tags |
471
- existing_fingerprint.domain_tags), 1
472
  )
473
 
474
  # Collect opposition evidence
@@ -477,14 +462,12 @@ class ConflictDetector:
477
  new_fingerprint.negation_indicators
478
  and not existing_fingerprint.negation_indicators
479
  ):
480
- opposition_indicators.extend(
481
- new_fingerprint.negation_indicators)
482
  elif (
483
  existing_fingerprint.negation_indicators
484
  and not new_fingerprint.negation_indicators
485
  ):
486
- opposition_indicators.extend(
487
- existing_fingerprint.negation_indicators)
488
 
489
  # Determine conflict type
490
  conflict_type = self._determine_conflict_type(
@@ -493,7 +476,8 @@ class ConflictDetector:
493
 
494
  # Calculate confidence score
495
  confidence = self._calculate_confidence_score(
496
- similarity, opposition_score, context_overlap, opposition_indicators)
 
497
 
498
  if confidence >= self.min_confidence_score:
499
  conflict = ConflictEvidence(
@@ -530,13 +514,9 @@ class ConflictDetector:
530
  # Temporal conflicts
531
  if fp1.temporal_markers and fp2.temporal_markers:
532
  # Simple temporal conflict detection
533
- if any(
534
- marker in [
535
- "before",
536
- "earlier"] for marker in fp1.temporal_markers) and any(
537
- marker in [
538
- "after",
539
- "later"] for marker in fp2.temporal_markers):
540
  score += 0.3
541
 
542
  return min(score, 1.0)
@@ -570,8 +550,7 @@ class ConflictDetector:
570
  indicators: List[str],
571
  ) -> float:
572
  """Calculate confidence score for a conflict detection."""
573
- base_score = (similarity * 0.4) + \
574
- (opposition_score * 0.4) + (context_overlap * 0.2)
575
 
576
  # Boost confidence if we have clear opposition indicators
577
  indicator_boost = min(len(indicators) * 0.1, 0.2)
@@ -601,11 +580,10 @@ class ConflictDetector:

  def _generate_conflict_id(self, conflict: ConflictEvidence) -> str:
  """Generate unique ID for a conflict."""
- content = (
- f"{
  conflict.statement_a_id}_{
  conflict.statement_b_id}_{
- conflict.conflict_type.value}")
  return hashlib.md5(content.encode()).hexdigest()[:12]
610
 
611
  def _generate_conflict_recommendation(
@@ -629,12 +607,10 @@ class ConflictDetector:
629
  recommendations = []
630
 
631
  if status == "critical":
632
- recommendations.append(
633
- "Immediate review required - multiple high-confidence conflicts")
634
  recommendations.append("Consider statement validation workflow")
635
  elif status == "warning":
636
- recommendations.append(
637
- "Monitor conflicts closely - elevated conflict level")
638
  recommendations.append("Review recent statements for accuracy")
639
 
640
  # Type-specific recommendations
@@ -644,12 +620,10 @@ class ConflictDetector:
644
  )
645
 
646
  if conflict_types.get("temporal_conflict", 0) > 2:
647
- recommendations.append(
648
- "Temporal conflicts detected - verify timeline consistency")
649
 
650
  if not recommendations:
651
- recommendations.append(
652
- "System operating normally - continue monitoring")
653
 
654
  return recommendations
655
 
@@ -659,7 +633,8 @@ class ConflictDetector:
659
  return 1.0
660
 
661
  high_confidence_conflicts = sum(
662
- 1 for conflict in self.detected_conflicts if conflict.confidence_score > 0.8)
 
663
 
664
  total_statements = len(self.statement_fingerprints)
665
  if total_statements == 0:
 
  - Confidence scoring and evidence collection
  """

+ def __init__(self, config: Optional[Dict[str, Any]] = None, embedding_provider=None):
  self.config = config or {}
  self.embedding_provider = embedding_provider

  # Configuration parameters
+ self.opposition_threshold = self.config.get("opposition_threshold", 0.7)
+ self.semantic_similarity_threshold = self.config.get("semantic_similarity_threshold", 0.8)
+ self.min_confidence_score = self.config.get("min_confidence_score", 0.6)
+ self.max_statement_age_hours = self.config.get("max_statement_age_hours", 24)

  # Storage
  self.statement_fingerprints: Dict[str, StatementFingerprint] = {}
 
145
  "average_confidence": 0.0,
146
  }
147
 
148
+ def process_statements(self, statements: List[Dict[str, Any]]) -> Dict[str, Any]:
 
149
  """
150
  Process new statements and detect conflicts with existing statements.
151
 
 
161
  "new_conflicts": [],
162
  "fingerprints_created": 0,
163
  "total_active_statements": 0,
164
+ "conflict_summary": {"high_confidence": 0, "medium_confidence": 0, "low_confidence": 0},
 
 
 
165
  }
166
 
167
  # Process each statement
 
173
  continue
174
 
175
  # Create fingerprint for new statement
176
+ fingerprint = self._create_statement_fingerprint(statement_id, content, statement)
 
177
  self.statement_fingerprints[statement_id] = fingerprint
178
  processing_report["fingerprints_created"] += 1
179
 
 
191
  "conflict_type": conflict.conflict_type.value,
192
  "confidence_score": conflict.confidence_score,
193
  "opposition_indicators": conflict.opposition_indicators,
194
+ }
195
+ )
196
 
197
  # Categorize by confidence
198
  if conflict.confidence_score >= 0.8:
 
208
  # Update metrics
209
  elapsed_ms = (time.time() - start_time) * 1000
210
  self.metrics["statements_processed"] += len(statements)
211
+ self.metrics["conflicts_detected"] += len(processing_report["new_conflicts"])
 
212
  self.metrics["processing_time_ms"] += elapsed_ms
213
 
214
  if self.detected_conflicts:
 
217
  ) / len(self.detected_conflicts)
218
 
219
  processing_report["elapsed_ms"] = elapsed_ms
220
+ processing_report["total_active_statements"] = len(self.statement_fingerprints)
221
+ processing_report["total_conflicts_detected"] = len(self.detected_conflicts)
 
 
222
 
223
  return processing_report
224
 
 
249
 
250
  for conflict in conflicts_involving_statement:
251
  conflict_type = conflict.conflict_type.value
252
+ conflict_types[conflict_type] = conflict_types.get(conflict_type, 0) + 1
 
253
  max_confidence = max(max_confidence, conflict.confidence_score)
254
 
255
  # Add opposing statement
 
309
  for conflict in self.detected_conflicts:
310
  # Count by type
311
  conflict_type = conflict.conflict_type.value
312
+ conflict_types[conflict_type] = conflict_types.get(conflict_type, 0) + 1
 
313
 
314
  # Count by confidence
315
  if conflict.confidence_score >= 0.8:
 
335
  status = "healthy"
336
 
337
  health_score = self._calculate_health_score()
338
+ recommendations = self._generate_system_recommendations(status, conflict_types)
 
339
 
340
  return {
341
  "total_conflicts": len(self.detected_conflicts),
 
385
  # Detect negation indicators
386
  content_lower = content.lower()
387
  negation_indicators = [
388
+ pattern for pattern in self.negation_patterns if pattern in content_lower
389
+ ]
390
 
391
  # Calculate assertion strength
392
  assertion_indicators = [
393
+ pattern for pattern in self.assertion_patterns if pattern in content_lower
394
+ ]
395
  assertion_strength = min(len(assertion_indicators) * 0.2, 1.0)
396
 
397
  # Extract temporal markers
398
  temporal_markers = [
399
+ pattern for pattern in self.temporal_patterns if pattern in content_lower
400
+ ]
401
 
402
  # Extract domain tags (simple keyword-based)
403
  domain_tags = set()
 
453
  context_overlap = len(
454
  new_fingerprint.domain_tags & existing_fingerprint.domain_tags
455
  ) / max(
456
+ len(new_fingerprint.domain_tags | existing_fingerprint.domain_tags), 1
 
457
  )
458
 
459
  # Collect opposition evidence
 
462
  new_fingerprint.negation_indicators
463
  and not existing_fingerprint.negation_indicators
464
  ):
465
+ opposition_indicators.extend(new_fingerprint.negation_indicators)
 
466
  elif (
467
  existing_fingerprint.negation_indicators
468
  and not new_fingerprint.negation_indicators
469
  ):
470
+ opposition_indicators.extend(existing_fingerprint.negation_indicators)
 
471
 
472
  # Determine conflict type
473
  conflict_type = self._determine_conflict_type(
 
476
 
477
  # Calculate confidence score
478
  confidence = self._calculate_confidence_score(
479
+ similarity, opposition_score, context_overlap, opposition_indicators
480
+ )
481
 
482
  if confidence >= self.min_confidence_score:
483
  conflict = ConflictEvidence(
 
514
  # Temporal conflicts
515
  if fp1.temporal_markers and fp2.temporal_markers:
516
  # Simple temporal conflict detection
517
+ if any(marker in ["before", "earlier"] for marker in fp1.temporal_markers) and any(
518
+ marker in ["after", "later"] for marker in fp2.temporal_markers
519
+ ):
 
 
 
 
520
  score += 0.3
521
 
522
  return min(score, 1.0)
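The reflowed condition above awards a fixed bonus when one statement carries "before"/"earlier" markers and the other "after"/"later" markers. A minimal standalone sketch of that heuristic (the function name and the 0.0 base value are illustrative; only the 0.3 bonus and the marker lists come from the diff):

```python
from typing import Iterable

def temporal_opposition_bonus(markers_a: Iterable[str], markers_b: Iterable[str]) -> float:
    """Return the temporal-conflict bonus added to the opposition score."""
    if any(m in ("before", "earlier") for m in markers_a) and any(
        m in ("after", "later") for m in markers_b
    ):
        return 0.3  # fixed bonus, as in the diff
    return 0.0

print(temporal_opposition_bonus(["before"], ["later"]))  # 0.3
```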
 
550
  indicators: List[str],
551
  ) -> float:
552
  """Calculate confidence score for a conflict detection."""
553
+ base_score = (similarity * 0.4) + (opposition_score * 0.4) + (context_overlap * 0.2)
 
554
 
555
  # Boost confidence if we have clear opposition indicators
556
  indicator_boost = min(len(indicators) * 0.1, 0.2)
 
580
 
581
  def _generate_conflict_id(self, conflict: ConflictEvidence) -> str:
582
  """Generate unique ID for a conflict."""
583
+ content = f"{
 
584
  conflict.statement_a_id}_{
585
  conflict.statement_b_id}_{
586
+ conflict.conflict_type.value}"
587
  return hashlib.md5(content.encode()).hexdigest()[:12]
588
 
589
  def _generate_conflict_recommendation(
 
607
  recommendations = []
608
 
609
  if status == "critical":
610
+ recommendations.append("Immediate review required - multiple high-confidence conflicts")
 
611
  recommendations.append("Consider statement validation workflow")
612
  elif status == "warning":
613
+ recommendations.append("Monitor conflicts closely - elevated conflict level")
 
614
  recommendations.append("Review recent statements for accuracy")
615
 
616
  # Type-specific recommendations
 
620
  )
621
 
622
  if conflict_types.get("temporal_conflict", 0) > 2:
623
+ recommendations.append("Temporal conflicts detected - verify timeline consistency")
 
624
 
625
  if not recommendations:
626
+ recommendations.append("System operating normally - continue monitoring")
 
627
 
628
  return recommendations
629
 
 
633
  return 1.0
634
 
635
  high_confidence_conflicts = sum(
636
+ 1 for conflict in self.detected_conflicts if conflict.confidence_score > 0.8
637
+ )
638
 
639
  total_statements = len(self.statement_fingerprints)
640
  if total_statements == 0:
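Taken together, the consolidated lines in this file compute conflict confidence as a weighted blend of semantic similarity, opposition, and context overlap, plus a capped boost for explicit opposition indicators. A minimal sketch under those weights (the standalone function and the final clamp to 1.0 are assumptions; the 0.4/0.4/0.2 weights and the 0.2 boost cap are taken from the diff):

```python
from typing import List

def confidence_score(
    similarity: float,
    opposition_score: float,
    context_overlap: float,
    indicators: List[str],
) -> float:
    """Sketch of the confidence weighting consolidated above."""
    # Weighted blend of the three signals (0.4 / 0.4 / 0.2, as in the diff)
    base_score = (similarity * 0.4) + (opposition_score * 0.4) + (context_overlap * 0.2)
    # Clear opposition indicators add at most a 0.2 boost
    indicator_boost = min(len(indicators) * 0.1, 0.2)
    # Clamping to 1.0 is an assumption; the diff does not show the return line
    return min(base_score + indicator_boost, 1.0)

# Example: strong similarity and opposition with two negation indicators
print(confidence_score(0.85, 0.7, 0.5, ["not", "never"]))  # ≈ 0.92
```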
warbler_cda/evaporation.py CHANGED
@@ -9,8 +9,7 @@ from collections import Counter
9
  class EvaporationEngine:
10
  """Evaporation: converts molten glyphs into mist lines (proto-thoughts) with advanced style bias."""
11
 
12
- def __init__(self, magma_store, cloud_store,
13
- config: Optional[Dict[str, Any]] = None):
14
  self.magma_store = magma_store
15
  self.cloud_store = cloud_store
16
  self.config = config or {}
@@ -23,15 +22,12 @@ class EvaporationEngine:
23
  # Language generation parameters
24
  self.creativity_level = self.config.get("creativity_level", 0.7)
25
  self.compression_ratio = self.config.get("compression_ratio", 0.6)
26
- self.mythic_amplification = self.config.get(
27
- "mythic_amplification", 1.2)
28
 
29
  # Advanced distillation parameters
30
- self.semantic_density_threshold = self.config.get(
31
- "semantic_density_threshold", 0.5)
32
  self.affect_sensitivity = self.config.get("affect_sensitivity", 0.8)
33
- self.temporal_decay_factor = self.config.get(
34
- "temporal_decay_factor", 0.1)
35
 
36
  def evaporate(
37
  self, limit: int = 5, style_override: Optional[str] = None
@@ -43,8 +39,7 @@ class EvaporationEngine:
43
  mist_lines = []
44
  for i, glyph in enumerate(molten[:limit]):
45
  # Apply style variation for diversity
46
- current_style = style_override or self._determine_style_for_glyph(
47
- glyph, i)
48
 
49
  # Advanced mist distillation
50
  mist = self._advanced_distill_mist(glyph, current_style)
@@ -62,11 +57,9 @@ class EvaporationEngine:
62
 
63
  return mist_lines
64
 
65
- def _select_optimal_glyphs(
66
- self, target_count: int) -> List[Dict[str, Any]]:
67
  """Select optimal glyphs for evaporation based on multiple criteria."""
68
- molten = self.magma_store.select_hot(
69
- target_count * 3) # Get more candidates
70
 
71
  # Score glyphs based on multiple factors
72
  scored_glyphs = []
@@ -80,8 +73,7 @@ class EvaporationEngine:
80
 
81
  # Affect diversity factor
82
  affect = glyph.get("affect", {})
83
- affect_score = sum(abs(v)
84
- for v in affect.values()) / max(len(affect), 1)
85
  score += affect_score * 0.3
86
 
87
  # Temporal freshness factor
@@ -100,8 +92,7 @@ class EvaporationEngine:
100
  scored_glyphs.sort(key=lambda x: x[1], reverse=True)
101
  return [glyph for glyph, _ in scored_glyphs[:target_count]]
102
 
103
- def _determine_style_for_glyph(
104
- self, glyph: Dict[str, Any], index: int) -> str:
105
  """Determine optimal style for a specific glyph."""
106
  affect = glyph.get("affect", {})
107
  heat = glyph.get("heat", 0.0)
@@ -128,27 +119,22 @@ class EvaporationEngine:
128
 
129
  return base_style
130
 
131
- def _advanced_distill_mist(
132
- self, glyph: Dict[str, Any], style: str) -> Dict[str, Any]:
133
  """Advanced mist distillation with style bias."""
134
  summary = glyph.get("compressed_summary", "")
135
  affect = glyph.get("affect", {})
136
  heat = glyph.get("heat", 0.0)
137
 
138
  # Get style profile
139
- style_profile = self.style_profiles.get(
140
- style, self.style_profiles["balanced"])
141
 
142
  # Generate proto-thought with style bias
143
- proto_thought = self._generate_styled_proto_thought(
144
- summary, affect, style_profile)
145
 
146
  # Calculate advanced metrics
147
- evaporation_temp = self._calculate_evaporation_temperature(
148
- heat, affect)
149
  technical_clarity = self._calculate_technical_clarity(summary, style)
150
- mythic_weight = self._calculate_mythic_weight(
151
- affect, style) * self.mythic_amplification
152
 
153
  # Create enhanced mist line
154
  mist_line = {
@@ -212,15 +198,13 @@ class EvaporationEngine:
212
 
213
  return concepts[:5] # Limit to top 5 concepts
214
 
215
- def _apply_poetic_style(
216
- self, concepts: List[str], affect: Dict[str, Any]) -> str:
217
  """Apply poetic style to proto-thought."""
218
  if not concepts:
219
  return "[Poetic] Ethereal mist of untold stories..."
220
 
221
  # Poetic connectors and imagery
222
- poetic_connectors = ["whispers", "dreams",
223
- "echoes", "shadows", "light", "flow"]
224
  poetic_imagery = [
225
  "through ancient corridors",
226
  "across starlit paths",
@@ -238,8 +222,7 @@ class EvaporationEngine:
238
  concepts[0]} and {
239
  concepts[1]} {connector} {imagery}."
240
 
241
- def _apply_technical_style(
242
- self, concepts: List[str], affect: Dict[str, Any]) -> str:
243
  """Apply technical style to proto-thought."""
244
  if not concepts:
245
  return "[Technical] System processing: null input detected."
@@ -251,8 +234,7 @@ class EvaporationEngine:
251
  "Architecture review:",
252
  "Implementation note:",
253
  ]
254
- tech_connectors = ["enables", "facilitates",
255
- "optimizes", "integrates", "synchronizes"]
256
 
257
  prefix = random.choice(tech_prefixes)
258
  connector = random.choice(tech_connectors)
@@ -264,8 +246,7 @@ class EvaporationEngine:
264
  concepts[0]} {connector} {
265
  concepts[1]} subsystem."
266
 
267
- def _apply_narrative_style(
268
- self, concepts: List[str], affect: Dict[str, Any]) -> str:
269
  """Apply narrative style to proto-thought."""
270
  if not concepts:
271
  return "[Narrative] Once upon a time, in the realm of forgotten ideas..."
@@ -277,8 +258,7 @@ class EvaporationEngine:
277
  "Beyond the horizon,",
278
  "Within the tapestry of",
279
  ]
280
- narrative_actions = ["emerges", "dances",
281
- "whispers", "journeys", "transforms"]
282
 
283
  opener = random.choice(narrative_openers)
284
  action = random.choice(narrative_actions)
@@ -290,8 +270,7 @@ class EvaporationEngine:
290
  concepts[0]} and {
291
  concepts[1]} {action} together."
292
 
293
- def _apply_mythic_style(
294
- self, concepts: List[str], affect: Dict[str, Any]) -> str:
295
  """Apply mythic style to proto-thought."""
296
  if not concepts:
297
  return "[Mythic] From the primordial void, legends are born..."
@@ -320,8 +299,7 @@ class EvaporationEngine:
320
  concepts[0]} and {
321
  concepts[1]} within {entity}."
322
 
323
- def _apply_balanced_style(
324
- self, concepts: List[str], affect: Dict[str, Any]) -> str:
325
  """Apply balanced style to proto-thought."""
326
  if not concepts:
327
  return "[Balanced] Contemplation on the nature of existence..."
@@ -330,13 +308,11 @@ class EvaporationEngine:
  return f"[Balanced] Reflection on {
  concepts[0]} reveals deeper meaning."
  else:
- return (
- f"[Balanced] The interplay between {
  concepts[0]} and {
- concepts[1]} creates harmony.")

- def _apply_affect_coloring(
- self, proto_thought: str, affect: Dict[str, Any]) -> str:
  """Apply affect-based coloring to proto-thought."""
  if not affect:
  return proto_thought
@@ -367,15 +343,13 @@ class EvaporationEngine:
367
  if len(words) > 15:
368
  # Keep first, middle, and last parts
369
  keep_first = words[:5]
370
- keep_middle = words[len(words) // 2 - 2: len(words) // 2 + 2]
371
  keep_last = words[-3:]
372
- compressed = keep_first + ["..."] + \
373
- keep_middle + ["..."] + keep_last
374
  return " ".join(compressed)
375
  return proto_thought
376
 
377
- def _calculate_advanced_humidity(
378
- self, mist_lines: List[Dict[str, Any]]) -> float:
379
  """Calculate sophisticated humidity based on multiple factors."""
380
  if not mist_lines:
381
  return 0.0
@@ -390,10 +364,8 @@ class EvaporationEngine:
390
  mythic_contribution = avg_mythic * 0.3
391
 
392
  # Technical clarity contribution (inverse relationship)
393
- avg_clarity = sum(m.get("technical_clarity", 0.5)
394
- for m in mist_lines) / len(mist_lines)
395
- clarity_contribution = (1.0 - avg_clarity) * \
396
- 0.1 # Less clear = more humid
397
 
398
  # Style diversity contribution
399
  styles = [m.get("style", "balanced") for m in mist_lines]
@@ -426,10 +398,8 @@ class EvaporationEngine:
426
  return
427
 
428
  # Analyze mist characteristics
429
- avg_mythic = sum(m.get("mythic_weight", 0.0)
430
- for m in mist_lines) / len(mist_lines)
431
- avg_clarity = sum(m.get("technical_clarity", 0.5)
432
- for m in mist_lines) / len(mist_lines)
433
  styles = [m.get("style", "balanced") for m in mist_lines]
434
  style_diversity = len(set(styles)) / len(styles) if styles else 0
435
 
@@ -471,14 +441,12 @@ class EvaporationEngine:
471
 
472
  return min(1.0, density * length_factor)
473
 
474
- def _calculate_evaporation_temperature(
475
- self, heat: float, affect: Dict[str, Any]) -> float:
476
  """Calculate evaporation temperature based on heat and affect."""
477
  base_temp = heat * 0.8 # Primary factor from heat
478
 
479
  # Affect modulation
480
- affect_intensity = sum(abs(v)
481
- for v in affect.values()) / max(len(affect), 1)
482
  affect_modulation = affect_intensity * 0.2
483
 
484
  # Combine and bound
@@ -508,17 +476,14 @@ class EvaporationEngine:
508
  if summary:
509
  # Normalize by expected length
510
  complexity = len(summary.split()) / 20.0
511
- complexity_adjustment = max(-0.2,
512
- min(0.2, (0.5 - complexity) * 0.4))
513
  clarity += complexity_adjustment
514
 
515
  return min(1.0, max(0.1, clarity))
516
 
517
- def _calculate_mythic_weight(
518
- self, affect: Dict[str, Any], style: str) -> float:
519
  """Calculate mythic weight based on affect and style."""
520
- base_weight = affect.get("awe", 0.0) * 0.6 + \
521
- affect.get("wonder", 0.0) * 0.4
522
 
523
  # Style multipliers
524
  style_multipliers = {
@@ -536,8 +501,7 @@ class EvaporationEngine:
536
  multiplier = style_multipliers.get(style, 1.0)
537
  return min(1.0, base_weight * multiplier)
538
 
539
- def _create_affect_signature(
540
- self, affect: Dict[str, Any]) -> Dict[str, float]:
541
  """Create normalized affect signature."""
542
  if not affect:
543
  return {}
@@ -574,8 +538,7 @@ class EvaporationEngine:
574
  proto_concepts = set(self._extract_key_concepts(proto_thought))
575
 
576
  if original_concepts:
577
- preservation = len(original_concepts &
578
- proto_concepts) / len(original_concepts)
579
  quality_score += preservation * 0.2
580
 
581
  # Affect alignment
 
9
  class EvaporationEngine:
10
  """Evaporation: converts molten glyphs into mist lines (proto-thoughts) with advanced style bias."""
11
 
12
+ def __init__(self, magma_store, cloud_store, config: Optional[Dict[str, Any]] = None):
 
13
  self.magma_store = magma_store
14
  self.cloud_store = cloud_store
15
  self.config = config or {}
 
22
  # Language generation parameters
23
  self.creativity_level = self.config.get("creativity_level", 0.7)
24
  self.compression_ratio = self.config.get("compression_ratio", 0.6)
25
+ self.mythic_amplification = self.config.get("mythic_amplification", 1.2)
 
26
 
27
  # Advanced distillation parameters
28
+ self.semantic_density_threshold = self.config.get("semantic_density_threshold", 0.5)
 
29
  self.affect_sensitivity = self.config.get("affect_sensitivity", 0.8)
30
+ self.temporal_decay_factor = self.config.get("temporal_decay_factor", 0.1)
 
31
 
32
  def evaporate(
33
  self, limit: int = 5, style_override: Optional[str] = None
 
39
  mist_lines = []
40
  for i, glyph in enumerate(molten[:limit]):
41
  # Apply style variation for diversity
42
+ current_style = style_override or self._determine_style_for_glyph(glyph, i)
 
43
 
44
  # Advanced mist distillation
45
  mist = self._advanced_distill_mist(glyph, current_style)
 
57
 
58
  return mist_lines
59
 
60
+ def _select_optimal_glyphs(self, target_count: int) -> List[Dict[str, Any]]:
 
61
  """Select optimal glyphs for evaporation based on multiple criteria."""
62
+ molten = self.magma_store.select_hot(target_count * 3) # Get more candidates
 
63
 
64
  # Score glyphs based on multiple factors
65
  scored_glyphs = []
 
73
 
74
  # Affect diversity factor
75
  affect = glyph.get("affect", {})
76
+ affect_score = sum(abs(v) for v in affect.values()) / max(len(affect), 1)
 
77
  score += affect_score * 0.3
78
 
79
  # Temporal freshness factor
 
92
  scored_glyphs.sort(key=lambda x: x[1], reverse=True)
93
  return [glyph for glyph, _ in scored_glyphs[:target_count]]
94
 
95
+ def _determine_style_for_glyph(self, glyph: Dict[str, Any], index: int) -> str:
 
96
  """Determine optimal style for a specific glyph."""
97
  affect = glyph.get("affect", {})
98
  heat = glyph.get("heat", 0.0)
 
119
 
120
  return base_style
121
 
122
+ def _advanced_distill_mist(self, glyph: Dict[str, Any], style: str) -> Dict[str, Any]:
 
123
  """Advanced mist distillation with style bias."""
124
  summary = glyph.get("compressed_summary", "")
125
  affect = glyph.get("affect", {})
126
  heat = glyph.get("heat", 0.0)
127
 
128
  # Get style profile
129
+ style_profile = self.style_profiles.get(style, self.style_profiles["balanced"])
 
130
 
131
  # Generate proto-thought with style bias
132
+ proto_thought = self._generate_styled_proto_thought(summary, affect, style_profile)
 
133
 
134
  # Calculate advanced metrics
135
+ evaporation_temp = self._calculate_evaporation_temperature(heat, affect)
 
136
  technical_clarity = self._calculate_technical_clarity(summary, style)
137
+ mythic_weight = self._calculate_mythic_weight(affect, style) * self.mythic_amplification
 
138
 
139
  # Create enhanced mist line
140
  mist_line = {
 
198
 
199
  return concepts[:5] # Limit to top 5 concepts
200
 
201
+ def _apply_poetic_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
 
202
  """Apply poetic style to proto-thought."""
203
  if not concepts:
204
  return "[Poetic] Ethereal mist of untold stories..."
205
 
206
  # Poetic connectors and imagery
207
+ poetic_connectors = ["whispers", "dreams", "echoes", "shadows", "light", "flow"]
 
208
  poetic_imagery = [
209
  "through ancient corridors",
210
  "across starlit paths",
 
222
  concepts[0]} and {
223
  concepts[1]} {connector} {imagery}."
224
 
225
+ def _apply_technical_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
 
226
  """Apply technical style to proto-thought."""
227
  if not concepts:
228
  return "[Technical] System processing: null input detected."
 
234
  "Architecture review:",
235
  "Implementation note:",
236
  ]
237
+ tech_connectors = ["enables", "facilitates", "optimizes", "integrates", "synchronizes"]
 
238
 
239
  prefix = random.choice(tech_prefixes)
240
  connector = random.choice(tech_connectors)
 
246
  concepts[0]} {connector} {
247
  concepts[1]} subsystem."
248
 
249
+ def _apply_narrative_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
 
250
  """Apply narrative style to proto-thought."""
251
  if not concepts:
252
  return "[Narrative] Once upon a time, in the realm of forgotten ideas..."
 
258
  "Beyond the horizon,",
259
  "Within the tapestry of",
260
  ]
261
+ narrative_actions = ["emerges", "dances", "whispers", "journeys", "transforms"]
 
262
 
263
  opener = random.choice(narrative_openers)
264
  action = random.choice(narrative_actions)
 
270
  concepts[0]} and {
271
  concepts[1]} {action} together."
272
 
273
+ def _apply_mythic_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
 
274
  """Apply mythic style to proto-thought."""
275
  if not concepts:
276
  return "[Mythic] From the primordial void, legends are born..."
 
299
  concepts[0]} and {
300
  concepts[1]} within {entity}."
301
 
302
+ def _apply_balanced_style(self, concepts: List[str], affect: Dict[str, Any]) -> str:
 
303
  """Apply balanced style to proto-thought."""
304
  if not concepts:
305
  return "[Balanced] Contemplation on the nature of existence..."
 
  return f"[Balanced] Reflection on {
  concepts[0]} reveals deeper meaning."
  else:
+ return f"[Balanced] The interplay between {
  concepts[0]} and {
+ concepts[1]} creates harmony."

+ def _apply_affect_coloring(self, proto_thought: str, affect: Dict[str, Any]) -> str:
 
316
  """Apply affect-based coloring to proto-thought."""
317
  if not affect:
318
  return proto_thought
 
343
  if len(words) > 15:
344
  # Keep first, middle, and last parts
345
  keep_first = words[:5]
346
+ keep_middle = words[len(words) // 2 - 2 : len(words) // 2 + 2]
347
  keep_last = words[-3:]
348
+ compressed = keep_first + ["..."] + keep_middle + ["..."] + keep_last
 
349
  return " ".join(compressed)
350
  return proto_thought
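The single-line slices above compress long proto-thoughts by keeping the head, a small middle window, and the tail. A self-contained sketch of that behavior (the function name is illustrative; the slice bounds and the 15-word threshold match the diff):

```python
def compress_proto_thought(proto_thought: str) -> str:
    """Keep the first, middle, and last parts of long proto-thoughts."""
    words = proto_thought.split()
    if len(words) > 15:
        keep_first = words[:5]
        keep_middle = words[len(words) // 2 - 2 : len(words) // 2 + 2]
        keep_last = words[-3:]
        compressed = keep_first + ["..."] + keep_middle + ["..."] + keep_last
        return " ".join(compressed)
    return proto_thought

print(compress_proto_thought(" ".join(str(i) for i in range(20))))
# "0 1 2 3 4 ... 8 9 10 11 ... 17 18 19"
```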
351
 
352
+ def _calculate_advanced_humidity(self, mist_lines: List[Dict[str, Any]]) -> float:
 
353
  """Calculate sophisticated humidity based on multiple factors."""
354
  if not mist_lines:
355
  return 0.0
 
364
  mythic_contribution = avg_mythic * 0.3
365
 
366
  # Technical clarity contribution (inverse relationship)
367
+ avg_clarity = sum(m.get("technical_clarity", 0.5) for m in mist_lines) / len(mist_lines)
368
+ clarity_contribution = (1.0 - avg_clarity) * 0.1 # Less clear = more humid
 
 
369
 
370
  # Style diversity contribution
371
  styles = [m.get("style", "balanced") for m in mist_lines]
 
398
  return
399
 
400
  # Analyze mist characteristics
401
+ avg_mythic = sum(m.get("mythic_weight", 0.0) for m in mist_lines) / len(mist_lines)
402
+ avg_clarity = sum(m.get("technical_clarity", 0.5) for m in mist_lines) / len(mist_lines)
 
 
403
  styles = [m.get("style", "balanced") for m in mist_lines]
404
  style_diversity = len(set(styles)) / len(styles) if styles else 0
405
 
 
441
 
442
  return min(1.0, density * length_factor)
443
 
444
+ def _calculate_evaporation_temperature(self, heat: float, affect: Dict[str, Any]) -> float:
 
445
  """Calculate evaporation temperature based on heat and affect."""
446
  base_temp = heat * 0.8 # Primary factor from heat
447
 
448
  # Affect modulation
449
+ affect_intensity = sum(abs(v) for v in affect.values()) / max(len(affect), 1)
 
450
  affect_modulation = affect_intensity * 0.2
451
 
452
  # Combine and bound
 
476
  if summary:
477
  # Normalize by expected length
478
  complexity = len(summary.split()) / 20.0
479
+ complexity_adjustment = max(-0.2, min(0.2, (0.5 - complexity) * 0.4))
 
480
  clarity += complexity_adjustment
481
 
482
  return min(1.0, max(0.1, clarity))
483
 
484
+ def _calculate_mythic_weight(self, affect: Dict[str, Any], style: str) -> float:
 
485
  """Calculate mythic weight based on affect and style."""
486
+ base_weight = affect.get("awe", 0.0) * 0.6 + affect.get("wonder", 0.0) * 0.4
 
487
 
488
  # Style multipliers
489
  style_multipliers = {
 
501
  multiplier = style_multipliers.get(style, 1.0)
502
  return min(1.0, base_weight * multiplier)
503
 
504
+ def _create_affect_signature(self, affect: Dict[str, Any]) -> Dict[str, float]:
 
505
  """Create normalized affect signature."""
506
  if not affect:
507
  return {}
 
538
  proto_concepts = set(self._extract_key_concepts(proto_thought))
539
 
540
  if original_concepts:
541
+ preservation = len(original_concepts & proto_concepts) / len(original_concepts)
 
542
  quality_score += preservation * 0.2
543
 
544
  # Affect alignment
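The consolidated lines in this file also show how evaporation temperature and mythic weight are derived from heat and affect. A small sketch under those formulas (the standalone functions are illustrative; the lower clamp on temperature and the style multiplier values are assumptions, while the 0.8/0.2 and 0.6/0.4 weightings come from the diff):

```python
from typing import Dict

def evaporation_temperature(heat: float, affect: Dict[str, float]) -> float:
    """Sketch of the temperature formula consolidated above."""
    base_temp = heat * 0.8  # heat is the primary factor
    affect_intensity = sum(abs(v) for v in affect.values()) / max(len(affect), 1)
    affect_modulation = affect_intensity * 0.2
    # Bounding to [0, 1] is an assumption; the diff only says "combine and bound"
    return min(1.0, max(0.0, base_temp + affect_modulation))

def mythic_weight(affect: Dict[str, float], style: str) -> float:
    """Sketch of the mythic-weight formula; the multiplier table is illustrative."""
    base_weight = affect.get("awe", 0.0) * 0.6 + affect.get("wonder", 0.0) * 0.4
    style_multipliers = {"mythic": 1.5, "poetic": 1.2}  # example values only
    return min(1.0, base_weight * style_multipliers.get(style, 1.0))

print(evaporation_temperature(0.9, {"awe": 0.6, "wonder": 0.4}))  # ≈ 0.82
print(mythic_weight({"awe": 0.6, "wonder": 0.4}, "mythic"))       # ≈ 0.78
```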
warbler_cda/pack_loader.py CHANGED
@@ -37,14 +37,12 @@ class PackLoader:
37
 
38
  pack_docs = self._load_pack(pack_dir, pack_name)
39
  documents.extend(pack_docs)
40
- logger.info(
41
- f"✓ Loaded {len(pack_docs)} documents from {pack_name}")
42
 
43
  self.documents = documents
44
  return documents
45
 
46
- def _load_pack(self, pack_dir: Path,
47
- pack_name: str) -> List[Dict[str, Any]]:
48
  """Load documents from a specific pack"""
49
  documents = []
50
 
@@ -61,11 +59,7 @@ class PackLoader:
61
 
62
  return documents
63
 
64
- def _is_valid_warbler_pack(
65
- self,
66
- pack_dir: Path,
67
- pack_name: str,
68
- jsonl_file: Path) -> bool:
69
  """Validate that a directory is a valid Warbler pack
70
 
71
  A valid Warbler pack must have:
@@ -105,8 +99,7 @@ class PackLoader:
105
  # For chunked packs, look for chunk files
106
  chunk_files = list(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
107
  if chunk_files:
108
- logger.debug(
109
- f"Found {len(chunk_files)} chunk files for {pack_name}")
110
  return True
111
  else:
112
  logger.warning(f"Chunked pack {pack_name} has no chunk files")
@@ -116,12 +109,10 @@ class PackLoader:
116
  if jsonl_file.exists():
117
  return True
118
  else:
119
- logger.warning(
120
- f"Single-file pack {pack_name} missing JSONL file: {jsonl_file}")
121
  return False
122
 
123
- def _load_jsonl_pack(self, pack_dir: Path,
124
- pack_name: str) -> List[Dict[str, Any]]:
125
  """Load JSONL-based pack (supports both single-file and chunked packs)"""
126
  documents = []
127
 
@@ -135,11 +126,9 @@ class PackLoader:
135
  with open(package_json, "r", encoding="utf-8") as f:
136
  metadata = json.load(f)
137
  is_chunked = metadata.get("chunked", False)
138
- chunk_pattern = metadata.get(
139
- "chunk_pattern", f"{pack_name}-chunk-*.jsonl")
140
  except (json.JSONDecodeError, IOError) as e:
141
- logger.warning(
142
- f"Could not read package.json for {pack_name}: {e}")
143
 
144
  if is_chunked:
145
  # Load chunked pack
@@ -152,12 +141,10 @@ class PackLoader:
152
  chunk_files = sorted(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
153
 
154
  if not chunk_files:
155
- logger.warning(
156
- f"No chunk files found for chunked pack {pack_name}")
157
  return documents
158
 
159
- logger.info(
160
- f"Found {len(chunk_files)} chunk files for {pack_name}")
161
 
162
  # Load each chunk file in order
163
  for chunk_file in chunk_files:
@@ -168,7 +155,8 @@ class PackLoader:
168
  logger.info(
169
  f"Loaded {
170
  len(documents)} total documents from {
171
- len(chunk_files)} chunks")
 
172
  else:
173
  # Load single-file pack (backward compatibility)
174
  jsonl_file = pack_dir / f"{pack_name}.jsonl"
@@ -180,8 +168,7 @@ class PackLoader:
180
 
181
  return documents
182
 
183
- def _load_jsonl_file(self, jsonl_file: Path,
184
- pack_name: str) -> List[Dict[str, Any]]:
185
  """Load a single JSONL file with robust error handling"""
186
  documents = []
187
  error_count = 0
@@ -196,8 +183,11 @@ class PackLoader:
  try:
  entry = json.loads(line)
  doc = self._format_document(
- entry, pack_name, f"{
- jsonl_file.stem}_line_{line_num}")
  documents.append(doc)
  except json.JSONDecodeError as e:
  error_count += 1
@@ -205,7 +195,8 @@ class PackLoader:
  if error_count <= max_errors_to_log:
  logger.warning(
  f"Error parsing line {line_num} in {
- jsonl_file.name}: {e}")
  # Continue processing other lines instead of failing
  continue
211
 
@@ -213,14 +204,14 @@ class PackLoader:
213
  logger.info(
214
  f"Loaded {
215
  len(documents)} documents from {
216
- jsonl_file.name} ({error_count} lines skipped due to errors)")
 
217
  except Exception as e:
218
  logger.error(f"Error loading JSONL file {jsonl_file}: {e}")
219
 
220
  return documents
221
 
222
- def _load_structured_pack(self, pack_dir: Path,
223
- pack_name: str) -> List[Dict[str, Any]]:
224
  """Load structured pack with templates"""
225
  documents = []
226
 
@@ -233,8 +224,7 @@ class PackLoader:
233
  with open(templates_file, "r", encoding="utf-8") as f:
234
  data = json.load(f)
235
 
236
- templates = data if isinstance(
237
- data, list) else data.get("templates", [])
238
 
239
  for template in templates:
240
  doc = {
@@ -260,8 +250,7 @@ class PackLoader:
260
  self, entry: Dict[str, Any], pack_name: str, doc_id: str
261
  ) -> Dict[str, Any]:
262
  """Format a pack entry into a document"""
263
- content = entry.get("content") or entry.get(
264
- "text") or json.dumps(entry)
265
 
266
  return {
267
  "id": f"{pack_name}/{doc_id}",
 
37
 
38
  pack_docs = self._load_pack(pack_dir, pack_name)
39
  documents.extend(pack_docs)
40
+ logger.info(f"✓ Loaded {len(pack_docs)} documents from {pack_name}")
 
41
 
42
  self.documents = documents
43
  return documents
44
 
45
+ def _load_pack(self, pack_dir: Path, pack_name: str) -> List[Dict[str, Any]]:
 
46
  """Load documents from a specific pack"""
47
  documents = []
48
 
 
59
 
60
  return documents
61
 
62
+ def _is_valid_warbler_pack(self, pack_dir: Path, pack_name: str, jsonl_file: Path) -> bool:
 
 
 
 
63
  """Validate that a directory is a valid Warbler pack
64
 
65
  A valid Warbler pack must have:
 
99
  # For chunked packs, look for chunk files
100
  chunk_files = list(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
101
  if chunk_files:
102
+ logger.debug(f"Found {len(chunk_files)} chunk files for {pack_name}")
 
103
  return True
104
  else:
105
  logger.warning(f"Chunked pack {pack_name} has no chunk files")
 
109
  if jsonl_file.exists():
110
  return True
111
  else:
112
+ logger.warning(f"Single-file pack {pack_name} missing JSONL file: {jsonl_file}")
 
113
  return False
114
 
115
+ def _load_jsonl_pack(self, pack_dir: Path, pack_name: str) -> List[Dict[str, Any]]:
 
116
  """Load JSONL-based pack (supports both single-file and chunked packs)"""
117
  documents = []
118
 
 
126
  with open(package_json, "r", encoding="utf-8") as f:
127
  metadata = json.load(f)
128
  is_chunked = metadata.get("chunked", False)
129
+ chunk_pattern = metadata.get("chunk_pattern", f"{pack_name}-chunk-*.jsonl")
 
130
  except (json.JSONDecodeError, IOError) as e:
131
+ logger.warning(f"Could not read package.json for {pack_name}: {e}")
 
132
 
133
  if is_chunked:
134
  # Load chunked pack
 
141
  chunk_files = sorted(pack_dir.glob(f"{pack_name}-chunk-*.jsonl"))
142
 
143
  if not chunk_files:
144
+ logger.warning(f"No chunk files found for chunked pack {pack_name}")
 
145
  return documents
146
 
147
+ logger.info(f"Found {len(chunk_files)} chunk files for {pack_name}")
 
148
 
149
  # Load each chunk file in order
150
  for chunk_file in chunk_files:
 
155
  logger.info(
156
  f"Loaded {
157
  len(documents)} total documents from {
158
+ len(chunk_files)} chunks"
159
+ )
160
  else:
161
  # Load single-file pack (backward compatibility)
162
  jsonl_file = pack_dir / f"{pack_name}.jsonl"
 
168
 
169
  return documents
170
 
171
+ def _load_jsonl_file(self, jsonl_file: Path, pack_name: str) -> List[Dict[str, Any]]:
 
172
  """Load a single JSONL file with robust error handling"""
173
  documents = []
174
  error_count = 0
 
  try:
  entry = json.loads(line)
  doc = self._format_document(
+ entry,
+ pack_name,
+ f"{
+ jsonl_file.stem}_line_{line_num}",
+ )
  documents.append(doc)
192
  except json.JSONDecodeError as e:
193
  error_count += 1
 
195
  if error_count <= max_errors_to_log:
196
  logger.warning(
197
  f"Error parsing line {line_num} in {
198
+ jsonl_file.name}: {e}"
199
+ )
200
  # Continue processing other lines instead of failing
201
  continue
202
 
 
204
  logger.info(
205
  f"Loaded {
206
  len(documents)} documents from {
207
+ jsonl_file.name} ({error_count} lines skipped due to errors)"
208
+ )
209
  except Exception as e:
210
  logger.error(f"Error loading JSONL file {jsonl_file}: {e}")
211
 
212
  return documents
213
 
214
+ def _load_structured_pack(self, pack_dir: Path, pack_name: str) -> List[Dict[str, Any]]:
 
215
  """Load structured pack with templates"""
216
  documents = []
217
 
 
224
  with open(templates_file, "r", encoding="utf-8") as f:
225
  data = json.load(f)
226
 
227
+ templates = data if isinstance(data, list) else data.get("templates", [])
 
228
 
229
  for template in templates:
230
  doc = {
 
250
  self, entry: Dict[str, Any], pack_name: str, doc_id: str
251
  ) -> Dict[str, Any]:
252
  """Format a pack entry into a document"""
253
+ content = entry.get("content") or entry.get("text") or json.dumps(entry)
 
254
 
255
  return {
256
  "id": f"{pack_name}/{doc_id}",
warbler_cda/retrieval_api.py CHANGED
@@ -31,8 +31,7 @@ class RetrievalQuery:
31
  mode: RetrievalMode
32
  anchor_ids: Optional[List[str]] = None
33
  semantic_query: Optional[str] = None
34
- temporal_range: Optional[Tuple[float, float]
35
- ] = None # (start_time, end_time)
36
  max_results: int = 10
37
  confidence_threshold: float = 0.6
38
  exclude_conflicts: bool = True
@@ -129,17 +128,13 @@ class RetrievalAPI:
129
  self.quality_threshold = self.config.get("quality_threshold", 0.6)
130
 
131
  # STAT7 hybrid scoring configuration
132
- self.enable_stat7_hybrid = self.config.get(
133
- "enable_stat7_hybrid", False)
134
- self.default_weight_semantic = self.config.get(
135
- "default_weight_semantic", 0.6)
136
- self.default_weight_stat7 = self.config.get(
137
- "default_weight_stat7", 0.4)
138
 
139
  # Retrieval cache (for performance)
140
  self.query_cache: Dict[str, ContextAssembly] = {}
141
- self.cache_ttl_seconds = self.config.get(
142
- "cache_ttl_seconds", 300) # 5 minutes
143
 
144
  # Document STAT7 assignments cache (for rapid re-retrieval)
145
  self.document_stat7_cache: Dict[str, Dict[str, Any]] = {}
@@ -158,8 +153,7 @@ class RetrievalAPI:
158
  "quality_distribution": {"high": 0, "medium": 0, "low": 0},
159
  }
160
 
161
- def retrieve_context(
162
- self, query: Union[RetrievalQuery, Dict[str, Any]]) -> ContextAssembly:
163
  """
164
  Main retrieval method - assemble context based on query.
165
 
@@ -243,10 +237,7 @@ class RetrievalAPI:
243
  assembly = self.retrieve_context(query)
244
  return assembly.results
245
 
246
- def get_anchor_context(
247
- self,
248
- anchor_id: str,
249
- context_radius: int = 3) -> ContextAssembly:
250
  """
251
  Get context around a specific anchor.
252
 
@@ -266,10 +257,7 @@ class RetrievalAPI:
266
 
267
  return self.retrieve_context(query)
268
 
269
- def trace_provenance(
270
- self,
271
- content_id: str,
272
- max_depth: int = 5) -> ContextAssembly:
273
  """
274
  Trace provenance chain for a piece of content.
275
 
@@ -327,10 +315,12 @@ class RetrievalAPI:
327
  if embedding:
328
  doc_entry["embedding"] = embedding
329
 
330
- if (stat7_coordinates is None and embedding and hasattr(
331
- self.embedding_provider, "compute_stat7_from_embedding")):
332
- stat7_coordinates = self.embedding_provider.compute_stat7_from_embedding(
333
- embedding)
 
 
334
 
335
  if stat7_coordinates:
336
  doc_entry["stat7_coordinates"] = stat7_coordinates
@@ -367,22 +357,17 @@ class RetrievalAPI:
367
  anchor_ids=query_dict.get("anchor_ids"),
368
  semantic_query=query_dict.get("semantic_query"),
369
  temporal_range=query_dict.get("temporal_range"),
370
- max_results=query_dict.get(
371
- "max_results", self.default_max_results),
372
  confidence_threshold=query_dict.get("confidence_threshold", 0.6),
373
  exclude_conflicts=query_dict.get("exclude_conflicts", True),
374
  include_provenance=query_dict.get("include_provenance", True),
375
- stat7_hybrid=query_dict.get(
376
- "stat7_hybrid", self.enable_stat7_hybrid),
377
  stat7_address=query_dict.get("stat7_address"),
378
- weight_semantic=query_dict.get(
379
- "weight_semantic", self.default_weight_semantic),
380
- weight_stat7=query_dict.get(
381
- "weight_stat7", self.default_weight_stat7),
382
  )
383
 
384
- def _retrieve_semantic_similarity(
385
- self, query: RetrievalQuery) -> List[RetrievalResult]:
386
  """Retrieve content based on semantic similarity."""
387
  results = []
388
 
@@ -403,15 +388,13 @@ class RetrievalAPI:
403
  self.semantic_anchors}",
404
  file=sys.stderr,
405
  )
406
- print(
407
- f"DEBUG: context_store size={len(self._context_store)}", file=sys.stderr)
408
 
409
  # If embedding provider available, use it
410
  if self.embedding_provider:
411
  # Get query embedding
412
  try:
413
- query_embedding = self.embedding_provider.embed_text(
414
- query.semantic_query)
415
  except Exception:
416
  return results
417
 
@@ -431,8 +414,8 @@ class RetrievalAPI:
431
  content=anchor.concept_text,
432
  relevance_score=similarity,
433
  temporal_distance=self._calculate_temporal_distance(
434
- anchor.provenance.first_seen,
435
- query.query_timestamp),
436
  anchor_connections=[anchor_id],
437
  provenance_depth=1,
438
  conflict_flags=[],
@@ -461,8 +444,8 @@ class RetrievalAPI:
461
  content=micro.compressed_text,
462
  relevance_score=similarity,
463
  temporal_distance=self._calculate_temporal_distance(
464
- micro.creation_timestamp,
465
- query.query_timestamp),
466
  anchor_connections=[],
467
  provenance_depth=2,
468
  conflict_flags=[],
@@ -481,9 +464,7 @@ class RetrievalAPI:
481
 
482
  return results
483
 
484
- def _search_context_store(
485
- self,
486
- query: RetrievalQuery) -> List[RetrievalResult]:
487
  """
488
  Search context store using embeddings (semantic) or keyword fallback.
489
  Prefers embedding-based semantic search when available.
@@ -494,16 +475,14 @@ class RetrievalAPI:
494
  return results
495
 
496
  try:
497
- if self.embedding_provider and hasattr(
498
- self.embedding_provider, "semantic_search"):
499
  return self._search_context_store_semantic(query)
500
  except Exception as e:
501
  pass
502
 
503
  return self._search_context_store_keyword(query)
504
 
505
- def _search_context_store_semantic(
506
- self, query: RetrievalQuery) -> List[RetrievalResult]:
507
  """Search context store using semantic embeddings."""
508
  results = []
509
 
@@ -540,10 +519,8 @@ class RetrievalAPI:
540
  hybrid_score = sim_score
541
  if query.stat7_hybrid:
542
  hybrid_score = (
543
- query.weight_semantic *
544
- sim_score +
545
- query.weight_stat7 *
546
- stat7_resonance)
547
 
548
  result = RetrievalResult(
549
  result_id=f"ctx_{doc_id}",
@@ -565,8 +542,7 @@ class RetrievalAPI:
565
 
566
  return results
567
 
568
- def _search_context_store_keyword(
569
- self, query: RetrievalQuery) -> List[RetrievalResult]:
570
  """Fallback keyword-based search of context store."""
571
  results = []
572
 
@@ -613,21 +589,16 @@ class RetrievalAPI:
613
  return 0.5
614
 
615
  try:
616
- lineage_dist = abs(doc_stat7.get("lineage", 0.5) -
617
- query_stat7.get("lineage", 0.5))
618
  adjacency_dist = abs(
619
- doc_stat7.get("adjacency", 0.5) -
620
- query_stat7.get("adjacency", 0.5)
621
  )
622
  luminosity_dist = abs(
623
- doc_stat7.get("luminosity", 0.7) -
624
- query_stat7.get("luminosity", 0.7)
625
  )
626
- polarity_dist = abs(doc_stat7.get(
627
- "polarity", 0.5) - query_stat7.get("polarity", 0.5))
628
  dimensionality_dist = abs(
629
- doc_stat7.get("dimensionality", 0.5) -
630
- query_stat7.get("dimensionality", 0.5)
631
  )
632
 
633
  avg_distance = (
@@ -643,8 +614,7 @@ class RetrievalAPI:
643
  except Exception:
644
  return 0.5
645
 
646
- def _retrieve_temporal_sequence(
647
- self, query: RetrievalQuery) -> List[RetrievalResult]:
648
  """Retrieve content based on temporal sequence."""
649
  results = []
650
 
@@ -664,15 +634,15 @@ class RetrievalAPI:
664
  for anchor_id, anchor in self.semantic_anchors.anchors.items():
665
  if temporal_range[0] <= anchor.provenance.first_seen <= temporal_range[1]:
666
  temporal_items.append(
667
- ("anchor", anchor_id, anchor.provenance.first_seen, anchor))
 
668
 
669
  # Add micro-summaries
670
  if self.summarization_ladder:
671
  for micro in self.summarization_ladder.micro_summaries:
672
  if temporal_range[0] <= micro.creation_timestamp <= temporal_range[1]:
673
  temporal_items.append(
674
- ("micro_summary", micro.summary_id,
675
- micro.creation_timestamp, micro)
676
  )
677
 
678
  # Sort by timestamp
@@ -711,15 +681,13 @@ class RetrievalAPI:
711
  anchor_connections=[],
712
  provenance_depth=2,
713
  conflict_flags=[],
714
- metadata={"timestamp": timestamp,
715
- "window_size": micro.window_size},
716
  )
717
  results.append(result)
718
 
719
  return results
720
 
721
- def _retrieve_anchor_neighborhood(
722
- self, query: RetrievalQuery) -> List[RetrievalResult]:
723
  """Retrieve content in the neighborhood of specific anchors."""
724
  results = []
725
 
@@ -756,15 +724,13 @@ class RetrievalAPI:
756
  anchor_connections=[anchor_id, other_id],
757
  provenance_depth=1,
758
  conflict_flags=[],
759
- metadata={"neighbor_of": anchor_id,
760
- "similarity": similarity},
761
  )
762
  results.append(result)
763
 
764
  return results
765
 
766
- def _retrieve_provenance_chain(
767
- self, query: RetrievalQuery) -> List[RetrievalResult]:
768
  """Retrieve content following provenance relationships."""
769
  results = []
770
 
@@ -795,8 +761,7 @@ class RetrievalAPI:
795
  results.append(result)
796
 
797
  # Add related content from update history
798
- for i, update in enumerate(
799
- anchor.provenance.update_history):
800
  if i >= query.max_results - 1:
801
  break
802
 
@@ -806,20 +771,17 @@ class RetrievalAPI:
806
  content_id=f"{anchor_id}_update_{i}",
807
  content=f"Update: {update.get('context', {}).get('mist_id', 'unknown')}",
808
  relevance_score=0.8 - (i * 0.1),
809
- temporal_distance=abs(
810
- update["timestamp"] - query.query_timestamp),
811
  anchor_connections=[anchor_id],
812
  provenance_depth=i + 1,
813
  conflict_flags=[],
814
- metadata={
815
- "update_context": update.get("context", {})},
816
  )
817
  results.append(result)
818
 
819
  return results
820
 
821
- def _retrieve_conflict_aware(
822
- self, query: RetrievalQuery) -> List[RetrievalResult]:
823
  """Retrieve content while avoiding conflicts."""
824
  # First get base results
825
  base_results = self._retrieve_semantic_similarity(query)
@@ -835,14 +797,14 @@ class RetrievalAPI:
835
 
836
  # Check for conflicts involving this content
837
  if hasattr(self.conflict_detector, "get_conflict_analysis"):
838
- conflict_analysis = self.conflict_detector.get_conflict_analysis(
839
- result.content_id)
840
  if conflict_analysis.get("conflicts_found", 0) > 0:
841
  conflicts = [
842
  f"conflict_confidence_{
843
  conflict_analysis.get(
844
  'max_confidence',
845
- 0):.2f}"]
 
846
 
847
  # Include result but flag conflicts
848
  result.conflict_flags = conflicts
@@ -851,9 +813,7 @@ class RetrievalAPI:
851
 
852
  return filtered_results
853
 
854
- def _retrieve_composite(
855
- self,
856
- query: RetrievalQuery) -> List[RetrievalResult]:
857
  """Retrieve using multiple modes and combine results."""
858
  all_results = []
859
 
@@ -896,15 +856,13 @@ class RetrievalAPI:
896
  self.metrics["hybrid_queries"] += 1
897
 
898
  # Filter by confidence threshold
899
- filtered = [r for r in results if r.relevance_score >=
900
- query.confidence_threshold]
901
 
902
  # Apply temporal decay
903
  current_time = query.query_timestamp
904
  for result in filtered:
905
  age_hours = result.temporal_distance / 3600
906
- decay_factor = max(
907
- 0.1, 1.0 - (age_hours / self.temporal_decay_hours))
908
  result.relevance_score *= decay_factor
909
 
910
  # Sort by relevance score
@@ -937,8 +895,7 @@ class RetrievalAPI:
937
  # Temporal span
938
  timestamps = [r.temporal_distance for r in results]
939
  temporal_span_hours = (
940
- (max(timestamps) - min(timestamps)) /
941
- 3600 if len(timestamps) > 1 else 0
942
  )
943
 
944
  # Anchor coverage
@@ -968,17 +925,11 @@ class RetrievalAPI:
968
  retrieval_timestamp=time.time(),
969
  )
970
 
971
- def _calculate_temporal_distance(
972
- self,
973
- timestamp: float,
974
- reference_time: float) -> float:
975
  """Calculate temporal distance between two timestamps."""
976
  return abs(timestamp - reference_time)
977
 
978
- def _calculate_temporal_relevance(
979
- self,
980
- timestamp: float,
981
- reference_time: float) -> float:
982
  """Calculate relevance based on temporal proximity."""
983
  distance_seconds = abs(timestamp - reference_time)
984
  distance_hours = distance_seconds / 3600
@@ -1060,14 +1011,12 @@ class RetrievalAPI:
1060
  def _update_metrics(self, assembly: ContextAssembly, elapsed_ms: float):
1061
  """Update performance metrics."""
1062
  self.metrics["average_results_per_query"] = (
1063
- self.metrics["average_results_per_query"] *
1064
- (self.metrics["total_queries"] - 1)
1065
  + len(assembly.results)
1066
  ) / self.metrics["total_queries"]
1067
 
1068
  self.metrics["average_retrieval_time_ms"] = (
1069
- self.metrics["average_retrieval_time_ms"] *
1070
- (self.metrics["total_queries"] - 1)
1071
  + elapsed_ms
1072
  ) / self.metrics["total_queries"]
1073
 
@@ -1081,8 +1030,7 @@ class RetrievalAPI:
1081
 
1082
  def _calculate_cache_hit_rate(self) -> float:
1083
  """Calculate cache hit rate."""
1084
- total_requests = self.metrics["cache_hits"] + \
1085
- self.metrics["cache_misses"]
1086
  if total_requests == 0:
1087
  return 0.0
1088
  return self.metrics["cache_hits"] / total_requests
@@ -1228,8 +1176,8 @@ class RetrievalAPI:
1228
  try:
1229
  q_stat7_dict = query.stat7_address
1230
  query_realm = Realm(
1231
- type=q_stat7_dict["realm"]["type"],
1232
- label=q_stat7_dict["realm"]["label"])
1233
  query_stat7 = STAT7Address(
1234
  realm=query_realm,
1235
  lineage=q_stat7_dict["lineage"],
@@ -1259,8 +1207,8 @@ class RetrievalAPI:
1259
 
1260
  try:
1261
  doc_realm = Realm(
1262
- type=doc_stat7_dict["realm"]["type"],
1263
- label=doc_stat7_dict["realm"]["label"])
1264
  doc_stat7 = STAT7Address(
1265
  realm=doc_realm,
1266
  lineage=doc_stat7_dict["lineage"],
@@ -1275,8 +1223,7 @@ class RetrievalAPI:
1275
  continue
1276
 
1277
  # Compute STAT7 resonance score
1278
- stat7_res = self.stat7_bridge.stat7_resonance(
1279
- query_stat7, doc_stat7)
1280
  result.stat7_resonance = stat7_res
1281
 
1282
  # Compute semantic similarity (if available)
@@ -1284,8 +1231,7 @@ class RetrievalAPI:
1284
  result.semantic_similarity = semantic_sim
1285
 
1286
  # Combine into hybrid score
1287
- hybrid = (query.weight_semantic * semantic_sim) + \
1288
- (query.weight_stat7 * stat7_res)
1289
  result.relevance_score = max(0.0, min(hybrid, 1.0))
1290
 
1291
  return results
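The hunks above consolidate two scoring steps: the semantic/STAT7 hybrid blend and the temporal decay applied before ranking. A small sketch under those formulas (the standalone functions are illustrative; the 0.6/0.4 default weights appear in the diff, while the 24-hour decay horizon here is an assumption standing in for self.temporal_decay_hours):

```python
def hybrid_relevance(
    semantic_sim: float,
    stat7_resonance: float,
    weight_semantic: float = 0.6,
    weight_stat7: float = 0.4,
) -> float:
    """Sketch of the hybrid score consolidated in the hunks above."""
    hybrid = (weight_semantic * semantic_sim) + (weight_stat7 * stat7_resonance)
    return max(0.0, min(hybrid, 1.0))

def apply_temporal_decay(score: float, age_hours: float, temporal_decay_hours: float = 24.0) -> float:
    """Sketch of the decay applied before sorting; the 24h default is an assumption."""
    decay_factor = max(0.1, 1.0 - (age_hours / temporal_decay_hours))
    return score * decay_factor

# Example: a strong semantic match with moderate STAT7 resonance, 6 hours old
score = hybrid_relevance(0.9, 0.5)       # 0.6*0.9 + 0.4*0.5 = 0.74
print(apply_temporal_decay(score, 6.0))  # 0.74 * 0.75 = 0.555
```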
 
31
  mode: RetrievalMode
32
  anchor_ids: Optional[List[str]] = None
33
  semantic_query: Optional[str] = None
34
+ temporal_range: Optional[Tuple[float, float]] = None # (start_time, end_time)
 
35
  max_results: int = 10
36
  confidence_threshold: float = 0.6
37
  exclude_conflicts: bool = True
 
128
  self.quality_threshold = self.config.get("quality_threshold", 0.6)
129
 
130
  # STAT7 hybrid scoring configuration
131
+ self.enable_stat7_hybrid = self.config.get("enable_stat7_hybrid", False)
132
+ self.default_weight_semantic = self.config.get("default_weight_semantic", 0.6)
133
+ self.default_weight_stat7 = self.config.get("default_weight_stat7", 0.4)
 
 
 
134
 
135
  # Retrieval cache (for performance)
136
  self.query_cache: Dict[str, ContextAssembly] = {}
137
+ self.cache_ttl_seconds = self.config.get("cache_ttl_seconds", 300) # 5 minutes
 
138
 
139
  # Document STAT7 assignments cache (for rapid re-retrieval)
140
  self.document_stat7_cache: Dict[str, Dict[str, Any]] = {}
 
153
  "quality_distribution": {"high": 0, "medium": 0, "low": 0},
154
  }
155
 
156
+ def retrieve_context(self, query: Union[RetrievalQuery, Dict[str, Any]]) -> ContextAssembly:
 
157
  """
158
  Main retrieval method - assemble context based on query.
159
 
 
237
  assembly = self.retrieve_context(query)
238
  return assembly.results
239
 
240
+ def get_anchor_context(self, anchor_id: str, context_radius: int = 3) -> ContextAssembly:
 
 
 
241
  """
242
  Get context around a specific anchor.
243
 
 
257
 
258
  return self.retrieve_context(query)
259
 
260
+ def trace_provenance(self, content_id: str, max_depth: int = 5) -> ContextAssembly:
 
 
 
261
  """
262
  Trace provenance chain for a piece of content.
263
 
 
315
  if embedding:
  doc_entry["embedding"] = embedding

+ if (
+ stat7_coordinates is None
+ and embedding
+ and hasattr(self.embedding_provider, "compute_stat7_from_embedding")
+ ):
+ stat7_coordinates = self.embedding_provider.compute_stat7_from_embedding(embedding)

  if stat7_coordinates:
  doc_entry["stat7_coordinates"] = stat7_coordinates
 
357
  anchor_ids=query_dict.get("anchor_ids"),
358
  semantic_query=query_dict.get("semantic_query"),
359
  temporal_range=query_dict.get("temporal_range"),
360
+ max_results=query_dict.get("max_results", self.default_max_results),
 
361
  confidence_threshold=query_dict.get("confidence_threshold", 0.6),
362
  exclude_conflicts=query_dict.get("exclude_conflicts", True),
363
  include_provenance=query_dict.get("include_provenance", True),
364
+ stat7_hybrid=query_dict.get("stat7_hybrid", self.enable_stat7_hybrid),
 
365
  stat7_address=query_dict.get("stat7_address"),
366
+ weight_semantic=query_dict.get("weight_semantic", self.default_weight_semantic),
367
+ weight_stat7=query_dict.get("weight_stat7", self.default_weight_stat7),
 
 
368
  )
369
 
370
+ def _retrieve_semantic_similarity(self, query: RetrievalQuery) -> List[RetrievalResult]:
 
371
  """Retrieve content based on semantic similarity."""
372
  results = []
373
 
 
388
  self.semantic_anchors}",
389
  file=sys.stderr,
390
  )
391
+ print(f"DEBUG: context_store size={len(self._context_store)}", file=sys.stderr)
 
392
 
393
  # If embedding provider available, use it
394
  if self.embedding_provider:
395
  # Get query embedding
396
  try:
397
+ query_embedding = self.embedding_provider.embed_text(query.semantic_query)
 
398
  except Exception:
399
  return results
400
 
 
414
  content=anchor.concept_text,
415
  relevance_score=similarity,
416
  temporal_distance=self._calculate_temporal_distance(
417
+ anchor.provenance.first_seen, query.query_timestamp
418
+ ),
419
  anchor_connections=[anchor_id],
420
  provenance_depth=1,
421
  conflict_flags=[],
 
444
  content=micro.compressed_text,
  relevance_score=similarity,
  temporal_distance=self._calculate_temporal_distance(
+ micro.creation_timestamp, query.query_timestamp
+ ),
  anchor_connections=[],
  provenance_depth=2,
  conflict_flags=[],

  return results

+ def _search_context_store(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """
  Search context store using embeddings (semantic) or keyword fallback.
  Prefers embedding-based semantic search when available.

  return results

  try:
+ if self.embedding_provider and hasattr(self.embedding_provider, "semantic_search"):
  return self._search_context_store_semantic(query)
  except Exception as e:
  pass

  return self._search_context_store_keyword(query)

+ def _search_context_store_semantic(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """Search context store using semantic embeddings."""
  results = []

  hybrid_score = sim_score
  if query.stat7_hybrid:
  hybrid_score = (
+ query.weight_semantic * sim_score + query.weight_stat7 * stat7_resonance
+ )

  result = RetrievalResult(
  result_id=f"ctx_{doc_id}",

  return results

+ def _search_context_store_keyword(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """Fallback keyword-based search of context store."""
  results = []

  return 0.5

  try:
+ lineage_dist = abs(doc_stat7.get("lineage", 0.5) - query_stat7.get("lineage", 0.5))
  adjacency_dist = abs(
+ doc_stat7.get("adjacency", 0.5) - query_stat7.get("adjacency", 0.5)
  )
  luminosity_dist = abs(
+ doc_stat7.get("luminosity", 0.7) - query_stat7.get("luminosity", 0.7)
  )
+ polarity_dist = abs(doc_stat7.get("polarity", 0.5) - query_stat7.get("polarity", 0.5))
  dimensionality_dist = abs(
+ doc_stat7.get("dimensionality", 0.5) - query_stat7.get("dimensionality", 0.5)
  )

  avg_distance = (

  except Exception:
  return 0.5

+ def _retrieve_temporal_sequence(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """Retrieve content based on temporal sequence."""
  results = []

  for anchor_id, anchor in self.semantic_anchors.anchors.items():
  if temporal_range[0] <= anchor.provenance.first_seen <= temporal_range[1]:
  temporal_items.append(
+ ("anchor", anchor_id, anchor.provenance.first_seen, anchor)
+ )

  # Add micro-summaries
  if self.summarization_ladder:
  for micro in self.summarization_ladder.micro_summaries:
  if temporal_range[0] <= micro.creation_timestamp <= temporal_range[1]:
  temporal_items.append(
+ ("micro_summary", micro.summary_id, micro.creation_timestamp, micro)
  )

  # Sort by timestamp

  anchor_connections=[],
  provenance_depth=2,
  conflict_flags=[],
+ metadata={"timestamp": timestamp, "window_size": micro.window_size},
  )
  results.append(result)

  return results

+ def _retrieve_anchor_neighborhood(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """Retrieve content in the neighborhood of specific anchors."""
  results = []

  anchor_connections=[anchor_id, other_id],
  provenance_depth=1,
  conflict_flags=[],
+ metadata={"neighbor_of": anchor_id, "similarity": similarity},
  )
  results.append(result)

  return results

+ def _retrieve_provenance_chain(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """Retrieve content following provenance relationships."""
  results = []

  results.append(result)

  # Add related content from update history
+ for i, update in enumerate(anchor.provenance.update_history):
  if i >= query.max_results - 1:
  break

  content_id=f"{anchor_id}_update_{i}",
  content=f"Update: {update.get('context', {}).get('mist_id', 'unknown')}",
  relevance_score=0.8 - (i * 0.1),
+ temporal_distance=abs(update["timestamp"] - query.query_timestamp),
  anchor_connections=[anchor_id],
  provenance_depth=i + 1,
  conflict_flags=[],
+ metadata={"update_context": update.get("context", {})},
  )
  results.append(result)

  return results

+ def _retrieve_conflict_aware(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """Retrieve content while avoiding conflicts."""
  # First get base results
  base_results = self._retrieve_semantic_similarity(query)

  # Check for conflicts involving this content
  if hasattr(self.conflict_detector, "get_conflict_analysis"):
+ conflict_analysis = self.conflict_detector.get_conflict_analysis(result.content_id)
  if conflict_analysis.get("conflicts_found", 0) > 0:
  conflicts = [
  f"conflict_confidence_{
  conflict_analysis.get(
  'max_confidence',
+ 0):.2f}"
+ ]

  # Include result but flag conflicts
  result.conflict_flags = conflicts

  return filtered_results

+ def _retrieve_composite(self, query: RetrievalQuery) -> List[RetrievalResult]:
  """Retrieve using multiple modes and combine results."""
  all_results = []

  self.metrics["hybrid_queries"] += 1

  # Filter by confidence threshold
+ filtered = [r for r in results if r.relevance_score >= query.confidence_threshold]

  # Apply temporal decay
  current_time = query.query_timestamp
  for result in filtered:
  age_hours = result.temporal_distance / 3600
+ decay_factor = max(0.1, 1.0 - (age_hours / self.temporal_decay_hours))
  result.relevance_score *= decay_factor

  # Sort by relevance score

  # Temporal span
  timestamps = [r.temporal_distance for r in results]
  temporal_span_hours = (
+ (max(timestamps) - min(timestamps)) / 3600 if len(timestamps) > 1 else 0
  )

  # Anchor coverage

  retrieval_timestamp=time.time(),
  )

+ def _calculate_temporal_distance(self, timestamp: float, reference_time: float) -> float:
  """Calculate temporal distance between two timestamps."""
  return abs(timestamp - reference_time)

+ def _calculate_temporal_relevance(self, timestamp: float, reference_time: float) -> float:
  """Calculate relevance based on temporal proximity."""
  distance_seconds = abs(timestamp - reference_time)
  distance_hours = distance_seconds / 3600

  def _update_metrics(self, assembly: ContextAssembly, elapsed_ms: float):
  """Update performance metrics."""
  self.metrics["average_results_per_query"] = (
+ self.metrics["average_results_per_query"] * (self.metrics["total_queries"] - 1)
  + len(assembly.results)
  ) / self.metrics["total_queries"]

  self.metrics["average_retrieval_time_ms"] = (
+ self.metrics["average_retrieval_time_ms"] * (self.metrics["total_queries"] - 1)
  + elapsed_ms
  ) / self.metrics["total_queries"]

  def _calculate_cache_hit_rate(self) -> float:
  """Calculate cache hit rate."""
+ total_requests = self.metrics["cache_hits"] + self.metrics["cache_misses"]
  if total_requests == 0:
  return 0.0
  return self.metrics["cache_hits"] / total_requests

  try:
  q_stat7_dict = query.stat7_address
  query_realm = Realm(
+ type=q_stat7_dict["realm"]["type"], label=q_stat7_dict["realm"]["label"]
+ )
  query_stat7 = STAT7Address(
  realm=query_realm,
  lineage=q_stat7_dict["lineage"],

  try:
  doc_realm = Realm(
+ type=doc_stat7_dict["realm"]["type"], label=doc_stat7_dict["realm"]["label"]
+ )
  doc_stat7 = STAT7Address(
  realm=doc_realm,
  lineage=doc_stat7_dict["lineage"],

  continue

  # Compute STAT7 resonance score
+ stat7_res = self.stat7_bridge.stat7_resonance(query_stat7, doc_stat7)
  result.stat7_resonance = stat7_res

  # Compute semantic similarity (if available)
  result.semantic_similarity = semantic_sim

  # Combine into hybrid score
+ hybrid = (query.weight_semantic * semantic_sim) + (query.weight_stat7 * stat7_res)
  result.relevance_score = max(0.0, min(hybrid, 1.0))

  return results
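
Reviewer note: the consolidated lines above boil down to a weighted blend of semantic similarity and STAT7 resonance, followed by a linear temporal decay on relevance. A minimal standalone sketch of that arithmetic follows; the helper names and the decay window default are illustrative, not part of the module API.

# Illustrative sketch of the hybrid scoring and temporal decay reflowed in this hunk.
# hybrid_relevance / decayed_relevance are hypothetical helpers, not module functions.
def hybrid_relevance(semantic_sim: float, stat7_res: float,
                     weight_semantic: float, weight_stat7: float) -> float:
    # Weighted blend of the two signals, clamped to [0, 1] as in the diff above.
    hybrid = (weight_semantic * semantic_sim) + (weight_stat7 * stat7_res)
    return max(0.0, min(hybrid, 1.0))

def decayed_relevance(relevance: float, temporal_distance_s: float,
                      temporal_decay_hours: float = 24.0) -> float:
    # Older results lose weight linearly with age, floored at 10% of their score.
    # The 24-hour window here is an example value, not the module's configured default.
    age_hours = temporal_distance_s / 3600
    decay_factor = max(0.1, 1.0 - (age_hours / temporal_decay_hours))
    return relevance * decay_factor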
warbler_cda/stat7_entity.py CHANGED
@@ -251,8 +251,7 @@ class STAT7Entity(ABC):
  # Event Tracking
  # ========================================================================

- def _record_event(self, event_type: str, description: str,
- metadata: Dict[str, Any] = None):
  """Record a lifecycle event"""
  event = LifecycleEvent(
  timestamp=datetime.utcnow(),
@@ -263,12 +262,9 @@ class STAT7Entity(ABC):
  self.lifecycle_events.append(event)
  self.last_activity = event.timestamp

- def get_event_history(
- self,
- limit: Optional[int] = None) -> List[LifecycleEvent]:
  """Get lifecycle events, optionally limited to most recent"""
- events = sorted(self.lifecycle_events,
- key=lambda e: e.timestamp, reverse=True)
  return events[:limit] if limit else events

  # ========================================================================
@@ -284,8 +280,7 @@ class STAT7Entity(ABC):
  self.entangled_entities.append(other_entity_id)
  self.entanglement_strength.append(strength)
  self._record_event(
- "entanglement_added", f"Entangled with {other_entity_id}", {
- "strength": strength}
  )

  def remove_entanglement(self, other_entity_id: str):
@@ -294,17 +289,13 @@ class STAT7Entity(ABC):
  idx = self.entangled_entities.index(other_entity_id)
  self.entangled_entities.pop(idx)
  self.entanglement_strength.pop(idx)
- self._record_event("entanglement_removed",
- f"Untangled from {other_entity_id}")

  def get_entanglements(self) -> List[Tuple[str, float]]:
  """Get all entangled entities with strength"""
  return list(zip(self.entangled_entities, self.entanglement_strength))

- def update_entanglement_strength(
- self,
- other_entity_id: str,
- new_strength: float):
  """Update entanglement strength with another entity"""
  if other_entity_id in self.entangled_entities:
  idx = self.entangled_entities.index(other_entity_id)
@@ -365,8 +356,7 @@ class STAT7Entity(ABC):
  {"trait_type": "Realm", "value": self.stat7.realm.value},
  {"trait_type": "Lineage", "value": self.stat7.lineage},
  {"trait_type": "Horizon", "value": self.stat7.horizon.value},
- {"trait_type": "Luminosity",
- "value": int(self.stat7.luminosity)},
  {"trait_type": "Polarity", "value": self.stat7.polarity.value},
  {"trait_type": "Dimensionality", "value": self.stat7.dimensionality},
  {"trait_type": "STAT7 Address", "value": self.stat7.address},
@@ -374,11 +364,7 @@ class STAT7Entity(ABC):
  "properties": card_data.get("properties", {}),
  }

- def record_mint(
- self,
- contract_address: str,
- token_id: int,
- ipfs_hash: str):
  """Record successful NFT minting"""
  self.nft_minted = True
  self.nft_contract = contract_address
@@ -387,8 +373,7 @@ class STAT7Entity(ABC):
  self._record_event(
  "nft_minted",
  f"Minted as ERC-721 token #{token_id}",
- {"contract": contract_address,
- "token_id": token_id, "ipfs_hash": ipfs_hash},
  )

  # ========================================================================
@@ -501,8 +486,7 @@ class STAT7Entity(ABC):
  "type": "entity_profile",
  **card_data,
  "owner": self.owner_id,
- "lifecycle_events": [
- e.to_dict() for e in self.lifecycle_events],
  "entanglements": self.get_entanglements(),
  "luca_trace": self.get_luca_trace(),
  }
@@ -515,8 +499,7 @@ class STAT7Entity(ABC):
  "stat7_dimensions": self.stat7.to_dict(),
  "realm_details": self._get_realm_details(),
  "entanglement_network": self.get_entanglements(),
- "event_chronology": [
- e.to_dict() for e in self.lifecycle_events],
  }

  def _get_realm_details(self) -> Dict[str, Any]:
 
  # Event Tracking
  # ========================================================================

+ def _record_event(self, event_type: str, description: str, metadata: Dict[str, Any] = None):
  """Record a lifecycle event"""
  event = LifecycleEvent(
  timestamp=datetime.utcnow(),

  self.lifecycle_events.append(event)
  self.last_activity = event.timestamp

+ def get_event_history(self, limit: Optional[int] = None) -> List[LifecycleEvent]:
  """Get lifecycle events, optionally limited to most recent"""
+ events = sorted(self.lifecycle_events, key=lambda e: e.timestamp, reverse=True)
  return events[:limit] if limit else events

  # ========================================================================

  self.entangled_entities.append(other_entity_id)
  self.entanglement_strength.append(strength)
  self._record_event(
+ "entanglement_added", f"Entangled with {other_entity_id}", {"strength": strength}
  )

  def remove_entanglement(self, other_entity_id: str):

  idx = self.entangled_entities.index(other_entity_id)
  self.entangled_entities.pop(idx)
  self.entanglement_strength.pop(idx)
+ self._record_event("entanglement_removed", f"Untangled from {other_entity_id}")

  def get_entanglements(self) -> List[Tuple[str, float]]:
  """Get all entangled entities with strength"""
  return list(zip(self.entangled_entities, self.entanglement_strength))

+ def update_entanglement_strength(self, other_entity_id: str, new_strength: float):
  """Update entanglement strength with another entity"""
  if other_entity_id in self.entangled_entities:
  idx = self.entangled_entities.index(other_entity_id)

  {"trait_type": "Realm", "value": self.stat7.realm.value},
  {"trait_type": "Lineage", "value": self.stat7.lineage},
  {"trait_type": "Horizon", "value": self.stat7.horizon.value},
+ {"trait_type": "Luminosity", "value": int(self.stat7.luminosity)},
  {"trait_type": "Polarity", "value": self.stat7.polarity.value},
  {"trait_type": "Dimensionality", "value": self.stat7.dimensionality},
  {"trait_type": "STAT7 Address", "value": self.stat7.address},

  "properties": card_data.get("properties", {}),
  }

+ def record_mint(self, contract_address: str, token_id: int, ipfs_hash: str):
  """Record successful NFT minting"""
  self.nft_minted = True
  self.nft_contract = contract_address

  self._record_event(
  "nft_minted",
  f"Minted as ERC-721 token #{token_id}",
+ {"contract": contract_address, "token_id": token_id, "ipfs_hash": ipfs_hash},
  )

  # ========================================================================

  "type": "entity_profile",
  **card_data,
  "owner": self.owner_id,
+ "lifecycle_events": [e.to_dict() for e in self.lifecycle_events],
  "entanglements": self.get_entanglements(),
  "luca_trace": self.get_luca_trace(),
  }

  "stat7_dimensions": self.stat7.to_dict(),
  "realm_details": self._get_realm_details(),
  "entanglement_network": self.get_entanglements(),
+ "event_chronology": [e.to_dict() for e in self.lifecycle_events],
  }

  def _get_realm_details(self) -> Dict[str, Any]:
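
Reviewer note: the entanglement methods consolidated above all rely on the same pattern, two index-aligned parallel lists for entity ids and strengths. A simplified stand-in is sketched below to show the invariant being maintained; it is not the STAT7Entity class itself.

# Illustrative sketch of the parallel-list entanglement bookkeeping, not module code.
from typing import List, Tuple

class EntanglementBook:
    def __init__(self) -> None:
        self.entangled_entities: List[str] = []
        self.entanglement_strength: List[float] = []

    def add(self, other_entity_id: str, strength: float) -> None:
        # Append to both lists so index i always pairs an id with its strength.
        self.entangled_entities.append(other_entity_id)
        self.entanglement_strength.append(strength)

    def remove(self, other_entity_id: str) -> None:
        # Pop the same index from both lists to keep them aligned.
        if other_entity_id in self.entangled_entities:
            idx = self.entangled_entities.index(other_entity_id)
            self.entangled_entities.pop(idx)
            self.entanglement_strength.pop(idx)

    def pairs(self) -> List[Tuple[str, float]]:
        return list(zip(self.entangled_entities, self.entanglement_strength))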
warbler_cda/stat7_experiments.py CHANGED
@@ -63,8 +63,7 @@ def normalize_float(value: float, decimal_places: int = 8) -> str:

  # Use Decimal for precise rounding
  d = Decimal(str(value))
- quantized = d.quantize(Decimal(10) ** -decimal_places,
- rounding=ROUND_HALF_EVEN)

  # Convert to string and strip trailing zeros (but keep at least one
  # decimal)
@@ -138,8 +137,7 @@ def canonical_serialize(data: Dict[str, Any]) -> str:
  sorted_data = sort_json_keys(data)

  # Serialize with no whitespace, ensure_ascii=False to preserve Unicode
- canonical = json.dumps(sorted_data, separators=(
- ",", ":"), ensure_ascii=True, sort_keys=False)

  return canonical

@@ -238,8 +236,7 @@ class BitChain:
  def get_stat7_uri(self) -> str:
  """Generate STAT7 URI address format."""
  coords = self.coordinates
- adjacency_hash = compute_address_hash(
- {"adjacency": sorted(coords.adjacency)})[:8]

  uri = f"stat7://{coords.realm}/{coords.lineage}/{adjacency_hash}/{coords.horizon}"
  uri += f"?r={normalize_float(coords.resonance)}"
@@ -255,8 +252,7 @@ class BitChain:

  REALMS = ["data", "narrative", "system", "faculty", "event", "pattern", "void"]
  HORIZONS = ["genesis", "emergence", "peak", "decay", "crystallization"]
- ENTITY_TYPES = ["concept", "artifact", "agent",
- "lineage", "adjacency", "horizon", "fragment"]


  def generate_random_bitchain(seed: Optional[int] = None) -> BitChain:
@@ -342,11 +338,8 @@ class EXP01_AddressUniqueness:
  for iteration in range(self.iterations):
  # Generate random bit-chains
  bitchains = [
- generate_random_bitchain(
- seed=iteration *
- 1000 +
- i) for i in range(
- self.sample_size)]

  # Compute addresses
  addresses = set()
@@ -387,15 +380,14 @@ class EXP01_AddressUniqueness:

  if collision_pairs:
  for addr, ids in collision_pairs.items():
- print(
- f" ⚠️ Collision on {addr[:16]}... : {len(ids)} entries")

  print()
  print(
  f"OVERALL RESULT: {
- '✅ ALL PASS' if all_success else '❌ SOME FAILED'}")
- print(
- f"Success rate: {sum(1 for r in self.results if r.success)}/{self.iterations}")

  return self.results, all_success

@@ -474,8 +466,7 @@ class EXP02_RetrievalEfficiency:
  print(f"Testing scale: {scale:,} bit-chains")

  # Generate bit-chains
- bitchains = [generate_random_bitchain(
- seed=i) for i in range(scale)]

  # Index by address for O(1) retrieval simulation
  address_to_bc = {bc.compute_address(): bc for bc in bitchains}
@@ -532,7 +523,8 @@ class EXP02_RetrievalEfficiency:

  print(
  f"OVERALL RESULT: {
- '✅ ALL PASS' if all_success else '❌ SOME FAILED'}")

  return self.results, all_success

@@ -606,8 +598,7 @@ class EXP03_DimensionNecessity:

  # Baseline: all 7 dimensions
  print("Baseline: All 7 dimensions")
- bitchains = [generate_random_bitchain(
- seed=i) for i in range(self.sample_size)]
  addresses = set()
  collisions = 0

@@ -631,7 +622,8 @@ class EXP03_DimensionNecessity:
  status = "✅ PASS" if result.acceptable else "❌ FAIL"
  print(
  f" {status} | Collisions: {collisions} | Rate: {
- baseline_collision_rate * 100:.4f}%")

  print()

  # Ablation: remove each dimension
@@ -661,8 +653,7 @@ class EXP03_DimensionNecessity:
  acceptable = collision_rate < 0.001  # Should be unacceptable without each dim

  result = EXP03_Result(
- dimensions_used=[
- d for d in self.STAT7_DIMENSIONS if d != removed_dim],
  sample_size=self.sample_size,
  collisions=collisions,
  collision_rate=collision_rate,
@@ -676,7 +667,8 @@ class EXP03_DimensionNecessity:
  status = "✅ NECESSARY" if necessity else "⚠️ OPTIONAL"
  print(
  f" {status} | Collisions: {collisions} | Rate: {
- collision_rate * 100:.4f}%")

  print()
  print(
@@ -720,8 +712,7 @@ def run_all_experiments(
  results = {}

  # EXP-01
- exp01 = EXP01_AddressUniqueness(
- sample_size=exp01_samples, iterations=exp01_iterations)
  _, exp01_success = exp01.run()
  results["EXP-01"] = {
  "success": exp01_success,
@@ -760,7 +751,8 @@ def run_all_experiments(
  print(
  f"\nOverall Phase 1 Status: {
  '✅ READY FOR PHASE 2' if all(
- r['success'] for r in results.values()) else '❌ NEEDS WORK'}")

  return results
 
 
  # Use Decimal for precise rounding
  d = Decimal(str(value))
+ quantized = d.quantize(Decimal(10) ** -decimal_places, rounding=ROUND_HALF_EVEN)

  # Convert to string and strip trailing zeros (but keep at least one
  # decimal)

  sorted_data = sort_json_keys(data)

  # Serialize with no whitespace, ensure_ascii=False to preserve Unicode
+ canonical = json.dumps(sorted_data, separators=(",", ":"), ensure_ascii=True, sort_keys=False)

  return canonical

  def get_stat7_uri(self) -> str:
  """Generate STAT7 URI address format."""
  coords = self.coordinates
+ adjacency_hash = compute_address_hash({"adjacency": sorted(coords.adjacency)})[:8]

  uri = f"stat7://{coords.realm}/{coords.lineage}/{adjacency_hash}/{coords.horizon}"
  uri += f"?r={normalize_float(coords.resonance)}"

  REALMS = ["data", "narrative", "system", "faculty", "event", "pattern", "void"]
  HORIZONS = ["genesis", "emergence", "peak", "decay", "crystallization"]
+ ENTITY_TYPES = ["concept", "artifact", "agent", "lineage", "adjacency", "horizon", "fragment"]


  def generate_random_bitchain(seed: Optional[int] = None) -> BitChain:

  for iteration in range(self.iterations):
  # Generate random bit-chains
  bitchains = [
+ generate_random_bitchain(seed=iteration * 1000 + i) for i in range(self.sample_size)
+ ]

  # Compute addresses
  addresses = set()

  if collision_pairs:
  for addr, ids in collision_pairs.items():
+ print(f" ⚠️ Collision on {addr[:16]}... : {len(ids)} entries")

  print()
  print(
  f"OVERALL RESULT: {
+ '✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
+ )
+ print(f"Success rate: {sum(1 for r in self.results if r.success)}/{self.iterations}")

  return self.results, all_success

  print(f"Testing scale: {scale:,} bit-chains")

  # Generate bit-chains
+ bitchains = [generate_random_bitchain(seed=i) for i in range(scale)]

  # Index by address for O(1) retrieval simulation
  address_to_bc = {bc.compute_address(): bc for bc in bitchains}

  print(
  f"OVERALL RESULT: {
+ '✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
+ )

  return self.results, all_success

  # Baseline: all 7 dimensions
  print("Baseline: All 7 dimensions")
+ bitchains = [generate_random_bitchain(seed=i) for i in range(self.sample_size)]
  addresses = set()
  collisions = 0

  status = "✅ PASS" if result.acceptable else "❌ FAIL"
  print(
  f" {status} | Collisions: {collisions} | Rate: {
+ baseline_collision_rate * 100:.4f}%"
+ )
  print()

  # Ablation: remove each dimension

  acceptable = collision_rate < 0.001  # Should be unacceptable without each dim

  result = EXP03_Result(
+ dimensions_used=[d for d in self.STAT7_DIMENSIONS if d != removed_dim],
  sample_size=self.sample_size,
  collisions=collisions,
  collision_rate=collision_rate,

  status = "✅ NECESSARY" if necessity else "⚠️ OPTIONAL"
  print(
  f" {status} | Collisions: {collisions} | Rate: {
+ collision_rate * 100:.4f}%"
+ )

  print()
  print(

  results = {}

  # EXP-01
+ exp01 = EXP01_AddressUniqueness(sample_size=exp01_samples, iterations=exp01_iterations)
  _, exp01_success = exp01.run()
  results["EXP-01"] = {
  "success": exp01_success,

  print(
  f"\nOverall Phase 1 Status: {
  '✅ READY FOR PHASE 2' if all(
+ r['success'] for r in results.values()) else '❌ NEEDS WORK'}"
+ )

  return results
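
Reviewer note: the two helpers touched at the top of this file exist to make STAT7 address hashing deterministic. A self-contained sketch of that idea follows; it mirrors the quantize/json.dumps calls visible in the diff, but plain sort_keys=True stands in for the module's sort_json_keys helper, so treat it as an approximation rather than the module's exact behaviour.

# Illustrative sketch of deterministic float normalization and canonical JSON,
# assuming the same rounding mode and separators shown in the diff above.
import json
from decimal import Decimal, ROUND_HALF_EVEN

def normalize_float_sketch(value: float, decimal_places: int = 8) -> str:
    # Banker's rounding at a fixed precision keeps hash inputs byte-stable.
    d = Decimal(str(value))
    quantized = d.quantize(Decimal(10) ** -decimal_places, rounding=ROUND_HALF_EVEN)
    return str(quantized)

def canonical_serialize_sketch(data: dict) -> str:
    # Compact separators plus sorted keys give one canonical serialization to hash.
    return json.dumps(data, separators=(",", ":"), ensure_ascii=True, sort_keys=True)

print(normalize_float_sketch(0.1 + 0.2))
print(canonical_serialize_sketch({"b": 1, "a": normalize_float_sketch(0.5)}))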
 
warbler_cda/stat7_rag_bridge.py CHANGED
@@ -55,11 +55,17 @@ class STAT7Address:

  def __post_init__(self):
  """Validate STAT7 constraints."""
- assert 0.0 <= self.adjacency <= 1.0, f"adjacency must be [0,1], got {
  self.adjacency}"
- assert 0.0 <= self.luminosity <= 1.0, f"luminosity must be [0,1], got {
  self.luminosity}"
- assert 0.0 <= self.polarity <= 1.0, f"polarity must be [0,1], got {
  self.polarity}"
  assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
  assert (
@@ -91,8 +97,7 @@ class RAGDocument:

  def __post_init__(self):
  """Validate document structure."""
- assert len(
- self.embedding) > 0, f"embedding must not be empty for {self.id}"


  # ============================================================================
@@ -116,9 +121,7 @@ def cosine_similarity(a: List[float], b: List[float]) -> float:
  return dot / denom


- def stat7_resonance(
- query_stat7: STAT7Address,
- doc_stat7: STAT7Address) -> float:
  """
  Compute STAT7 resonance between query and document addresses.

@@ -232,8 +235,7 @@ def retrieve(
  """
  scores = []
  for doc in documents:
- score = hybrid_score(query_embedding, doc,
- query_stat7, weight_semantic, weight_stat7)
  scores.append((doc.id, score))

  # Sort by score descending, return top-k
@@ -389,14 +391,11 @@ def compare_retrieval_results(
  overlap = len(semantic_ids & hybrid_ids)
  overlap_pct = (overlap / k * 100) if k > 0 else 0.0

- semantic_avg = sum(
- score for _, score in semantic_results[:k]) / k if k > 0 else 0.0
- hybrid_avg = sum(
- score for _, score in hybrid_results[:k]) / k if k > 0 else 0.0

  # Measure ranking distance: how far did top-k items move?
- semantic_rank = {doc_id: idx for idx,
- (doc_id, _) in enumerate(semantic_results[:k])}
  reranking_distances = []
  for idx, (doc_id, _) in enumerate(hybrid_results[:k]):
  if doc_id in semantic_rank:
@@ -404,8 +403,7 @@
  reranking_distances.append(distance)

  avg_reranking_distance = (
- sum(reranking_distances) /
- len(reranking_distances) if reranking_distances else 0.0
  )

  return {
@@ -434,8 +432,7 @@ class STAT7RAGBridge:
  dependency injection.
  """

- def stat7_resonance(self, query_stat7: STAT7Address,
- doc_stat7: STAT7Address) -> float:
  """
  Compute STAT7 resonance between query and document addresses.

@@ -467,12 +464,7 @@ class STAT7RAGBridge:

  Returns: [0.0, 1.0] hybrid score
  """
- return hybrid_score(
- query_embedding,
- doc,
- query_stat7,
- weight_semantic,
- weight_stat7)

  def retrieve(
  self,
@@ -496,10 +488,4 @@ class STAT7RAGBridge:

  Returns: List of (doc_id, hybrid_score) tuples, sorted by score (descending)
  """
- return retrieve(
- documents,
- query_embedding,
- query_stat7,
- k,
- weight_semantic,
- weight_stat7)
 
  def __post_init__(self):
  """Validate STAT7 constraints."""
+ assert (
+ 0.0 <= self.adjacency <= 1.0
+ ), f"adjacency must be [0,1], got {
  self.adjacency}"
+ assert (
+ 0.0 <= self.luminosity <= 1.0
+ ), f"luminosity must be [0,1], got {
  self.luminosity}"
+ assert (
+ 0.0 <= self.polarity <= 1.0
+ ), f"polarity must be [0,1], got {
  self.polarity}"
  assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
  assert (

  def __post_init__(self):
  """Validate document structure."""
+ assert len(self.embedding) > 0, f"embedding must not be empty for {self.id}"

  # ============================================================================

  return dot / denom

+ def stat7_resonance(query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
  """
  Compute STAT7 resonance between query and document addresses.

  """
  scores = []
  for doc in documents:
+ score = hybrid_score(query_embedding, doc, query_stat7, weight_semantic, weight_stat7)
  scores.append((doc.id, score))

  # Sort by score descending, return top-k

  overlap = len(semantic_ids & hybrid_ids)
  overlap_pct = (overlap / k * 100) if k > 0 else 0.0

+ semantic_avg = sum(score for _, score in semantic_results[:k]) / k if k > 0 else 0.0
+ hybrid_avg = sum(score for _, score in hybrid_results[:k]) / k if k > 0 else 0.0

  # Measure ranking distance: how far did top-k items move?
+ semantic_rank = {doc_id: idx for idx, (doc_id, _) in enumerate(semantic_results[:k])}
  reranking_distances = []
  for idx, (doc_id, _) in enumerate(hybrid_results[:k]):
  if doc_id in semantic_rank:

  reranking_distances.append(distance)

  avg_reranking_distance = (
+ sum(reranking_distances) / len(reranking_distances) if reranking_distances else 0.0
  )

  return {

  dependency injection.
  """

+ def stat7_resonance(self, query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
  """
  Compute STAT7 resonance between query and document addresses.

  Returns: [0.0, 1.0] hybrid score
  """
+ return hybrid_score(query_embedding, doc, query_stat7, weight_semantic, weight_stat7)

  def retrieve(
  self,

  Returns: List of (doc_id, hybrid_score) tuples, sorted by score (descending)
  """
+ return retrieve(documents, query_embedding, query_stat7, k, weight_semantic, weight_stat7)
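
Reviewer note: the bridge's hybrid_score and retrieve calls consolidated above ultimately reduce to a cosine similarity blended with a STAT7 resonance term. A standalone sketch of that math follows; the vectors, the resonance value, and the 0.7/0.3 split are toy data chosen here for illustration, not values from the module.

# Illustrative sketch of the scoring math behind hybrid_score; toy inputs only.
from math import sqrt

def cosine(a, b):
    # Same definition as the module's cosine_similarity: dot product over norms.
    dot = sum(x * y for x, y in zip(a, b))
    denom = sqrt(sum(x * x for x in a)) * sqrt(sum(y * y for y in b))
    return dot / denom if denom else 0.0

query_vec = [0.1, 0.8, 0.3]
doc_vec = [0.2, 0.7, 0.4]
stat7_res = 0.62                     # assumed output of stat7_resonance(query, doc)
weight_semantic, weight_stat7 = 0.7, 0.3   # example weights, as exposed by the demo's sliders

semantic = cosine(query_vec, doc_vec)
hybrid = weight_semantic * semantic + weight_stat7 * stat7_res
print(f"semantic={semantic:.3f} hybrid={hybrid:.3f}")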
 
 
 
 
 
 
warbler_cda/utils/load_warbler_packs.py CHANGED
@@ -52,26 +52,19 @@ class WarblerPackLoader:
  return []

  # Look for JSON, YAML, markdown, and JSONL files
- for pattern in [
- "**/*.json",
- "**/*.yaml",
- "**/*.yml",
- "**/*.md",
- "**/*.jsonl"]:
  for file_path in pack_path.glob(pattern):
  try:
  doc = self._parse_document(file_path, pack_name)
  if doc:
  documents.append(doc)
- logger.info(
- f"Discovered: {file_path.relative_to(PACKS_DIR)}")
  except Exception as e:
  logger.error(f"Error parsing {file_path}: {e}")

  return documents

- def _parse_document(self, file_path: Path,
- pack_name: str) -> Dict[str, Any]:
  """Parse a document file"""
  try:
  if file_path.suffix in [".json"]:
@@ -86,8 +79,7 @@ class WarblerPackLoader:
  # We'll read the first few lines and combine them
  with open(file_path, "r", encoding="utf-8") as f:
  lines = f.readlines()[:5]  # First 5 lines
- content = "\n".join(line.strip()
- for line in lines if line.strip())
  elif file_path.suffix in [".yaml", ".yml"]:
  import yaml

@@ -141,8 +133,7 @@ class WarblerPackLoader:
  logger.info(f"[OK] Loaded: {doc['content_id']}")
  return True
  else:
- logger.warning(
- f"API returned {response.status_code}: {response.text[:200]}")
  return False
  except requests.exceptions.ConnectionError:
  logger.error("Cannot connect to API. Is the service running?")
@@ -174,7 +165,8 @@ class WarblerPackLoader:
  click.secho(
  f"[OK] Load Complete: {
  self.loaded_count} docs ingested",
- fg="green")

  if self.error_count > 0:
  click.secho(f"[ERROR] Errors: {self.error_count}", fg="yellow")
  click.echo("=" * 60 + "\n")
@@ -189,9 +181,7 @@ def cli():


  @cli.command()
- @click.option("--api-url",
- default="http://localhost:8000",
- help="API service URL")
  def load(api_url):
  """Load all Warbler packs into the API"""
  loader = WarblerPackLoader(api_url)
@@ -202,8 +192,7 @@ def load(api_url):
  if response.status_code == 200:
  click.secho("[OK] API service is running", fg="green")
  else:
- click.secho(
- "[ERROR] API service not responding correctly", fg="red")
  return
  except Exception as e:
  click.secho(f"[ERROR] Cannot reach API at {api_url}: {e}", fg="red")
@@ -225,9 +214,7 @@ def load(api_url):


  @cli.command()
- @click.option("--api-url",
- default="http://localhost:8000",
- help="API service URL")
  def discover(api_url):
  """Discover documents in Warbler packs (no loading)"""
  loader = WarblerPackLoader(api_url)
@@ -251,7 +238,8 @@ def discover(api_url):
  f" Realm: {
  doc['metadata'].get(
  'realm_type',
- 'unknown')}")

  click.echo(f"\n[STATS] Total discovered: {total} documents\n")
 
  return []

  # Look for JSON, YAML, markdown, and JSONL files
+ for pattern in ["**/*.json", "**/*.yaml", "**/*.yml", "**/*.md", "**/*.jsonl"]:
  for file_path in pack_path.glob(pattern):
  try:
  doc = self._parse_document(file_path, pack_name)
  if doc:
  documents.append(doc)
+ logger.info(f"Discovered: {file_path.relative_to(PACKS_DIR)}")
  except Exception as e:
  logger.error(f"Error parsing {file_path}: {e}")

  return documents

+ def _parse_document(self, file_path: Path, pack_name: str) -> Dict[str, Any]:
  """Parse a document file"""
  try:
  if file_path.suffix in [".json"]:

  # We'll read the first few lines and combine them
  with open(file_path, "r", encoding="utf-8") as f:
  lines = f.readlines()[:5]  # First 5 lines
+ content = "\n".join(line.strip() for line in lines if line.strip())
  elif file_path.suffix in [".yaml", ".yml"]:
  import yaml

  logger.info(f"[OK] Loaded: {doc['content_id']}")
  return True
  else:
+ logger.warning(f"API returned {response.status_code}: {response.text[:200]}")
  return False
  except requests.exceptions.ConnectionError:
  logger.error("Cannot connect to API. Is the service running?")

  click.secho(
  f"[OK] Load Complete: {
  self.loaded_count} docs ingested",
+ fg="green",
+ )
  if self.error_count > 0:
  click.secho(f"[ERROR] Errors: {self.error_count}", fg="yellow")
  click.echo("=" * 60 + "\n")


  @cli.command()
+ @click.option("--api-url", default="http://localhost:8000", help="API service URL")
  def load(api_url):
  """Load all Warbler packs into the API"""
  loader = WarblerPackLoader(api_url)

  if response.status_code == 200:
  click.secho("[OK] API service is running", fg="green")
  else:
+ click.secho("[ERROR] API service not responding correctly", fg="red")
  return
  except Exception as e:
  click.secho(f"[ERROR] Cannot reach API at {api_url}: {e}", fg="red")


  @cli.command()
+ @click.option("--api-url", default="http://localhost:8000", help="API service URL")
  def discover(api_url):
  """Discover documents in Warbler packs (no loading)"""
  loader = WarblerPackLoader(api_url)

  f" Realm: {
  doc['metadata'].get(
  'realm_type',
+ 'unknown')}"
+ )

  click.echo(f"\n[STATS] Total discovered: {total} documents\n")
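
Reviewer note: the click commands reformatted above can be exercised directly from Python, for example in a smoke test. The sketch below uses click's standard CliRunner; it assumes the module is importable as warbler_cda.utils.load_warbler_packs from the repo root and that an API service is listening on the default URL, neither of which this diff guarantees.

# Illustrative sketch of driving the pack-loader CLI from Python via click's test runner.
from click.testing import CliRunner
from warbler_cda.utils.load_warbler_packs import cli  # import path assumed from this repo layout

runner = CliRunner()

# Discover pack documents without ingesting them.
result = runner.invoke(cli, ["discover", "--api-url", "http://localhost:8000"])
print(result.output)

# Load all discovered documents into a running API service.
result = runner.invoke(cli, ["load", "--api-url", "http://localhost:8000"])
print(result.exit_code)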