Luigi committed
Commit 77e98bd · Parent: 0a56987

Add speaker name detection feature

- Add LLM-based speaker name detection using LangChain prompts
- Only replace speaker IDs with high-confidence name detections
- Add 'Detect Speaker Names' button (only enabled with diarization)
- Update transcript rendering to show detected names
- Add API endpoint /api/detect-speaker-names (request/response contract sketched below)
- Add SpeakerNameDetectionRequest model
- Update frontend state management for speaker names
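
The feature's HTTP contract, as a minimal sketch: it assumes the FastAPI app is reachable at http://localhost:8000 and that the `requests` package is installed; the model identifier and utterance values below are placeholders, not part of this commit.

import requests

# Diarized utterances: each entry carries a numeric speaker id plus text and timings.
# The request model (SpeakerNameDetectionRequest) requires a non-empty utterances list.
payload = {
    "utterances": [
        {"speaker": 0, "text": "Hi everyone, I'm Dr. Smith, welcome to the show.", "start": 0.0, "end": 4.2},
        {"speaker": 1, "text": "Thanks for having me!", "start": 4.2, "end": 6.0},
    ],
    "llm_model": "example-gguf-model",  # placeholder model name
}

resp = requests.post("http://localhost:8000/api/detect-speaker-names", json=payload)
resp.raise_for_status()

# Only high-confidence detections are returned, keyed by speaker id, e.g.:
# {"0": {"name": "Dr. Smith", "confidence": "high", "reason": "Self-introduction"}}
# Speakers without a confident match are omitted and keep their "Speaker N" label.
print(resp.json())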

frontend/app.js CHANGED
@@ -4,6 +4,7 @@ const state = {
   utterances: [],
   diarizedUtterances: null,
   diarizationStats: null,
+  speakerNames: {}, // Maps speaker_id to detected name info
   summary: '',
   title: '',
   audioUrl: null,
@@ -11,6 +12,7 @@ const state = {
   uploadedFile: null,
   transcribing: false,
   summarizing: false,
+  detectingSpeakerNames: false,
 };

 const elements = {
@@ -29,6 +31,7 @@ const elements = {
   sensevoiceLanguage: document.getElementById('sensevoice-language'),
   transcribeBtn: document.getElementById('transcribe-btn'),
   summaryBtn: document.getElementById('summary-btn'),
+  detectSpeakerNamesBtn: document.getElementById('detect-speaker-names-btn'),
   statusText: document.getElementById('status-text'),
   audioPlayer: document.getElementById('audio-player'),
   transcriptList: document.getElementById('transcript-list'),
@@ -334,7 +337,13 @@ function renderTranscript() {

     const speakerTag = node.querySelector('.speaker-tag');
     if (typeof utt.speaker === 'number') {
-      speakerTag.textContent = `Speaker ${utt.speaker + 1}`;
+      const speakerId = utt.speaker;
+      const speakerName = state.speakerNames[speakerId]?.name;
+      if (speakerName) {
+        speakerTag.textContent = speakerName;
+      } else {
+        speakerTag.textContent = `Speaker ${speakerId + 1}`;
+      }
       speakerTag.classList.remove('hidden');
     }

@@ -347,9 +356,11 @@ function renderTranscript() {
 function renderDiarizationStats() {
   if (!state.diarizationStats) {
     elements.diarizationPanel.classList.add('hidden');
+    elements.detectSpeakerNamesBtn.classList.add('hidden');
     return;
   }
   elements.diarizationPanel.classList.remove('hidden');
+  elements.detectSpeakerNamesBtn.classList.remove('hidden');
   const stats = state.diarizationStats;

   elements.diarizationMetrics.innerHTML = '';
@@ -544,6 +555,47 @@ async function handleSummaryGeneration() {
   }
 }

+async function handleSpeakerNameDetection() {
+  if (state.detectingSpeakerNames || !state.diarizationStats) return;
+
+  state.detectingSpeakerNames = true;
+  setStatus('Detecting speaker names...', 'info');
+
+  const payload = {
+    utterances: state.utterances,
+    llm_model: elements.llmSelect.value,
+  };
+
+  try {
+    const response = await fetch('/api/detect-speaker-names', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+    });
+
+    if (!response.ok) throw new Error('Failed to detect speaker names');
+
+    const speakerNames = await response.json();
+    state.speakerNames = speakerNames;
+
+    // Re-render transcript to show detected names
+    renderTranscript();
+
+    const detectedCount = Object.keys(speakerNames).length;
+    if (detectedCount > 0) {
+      setStatus(`Detected names for ${detectedCount} speaker(s)`, 'success');
+    } else {
+      setStatus('No speaker names could be confidently detected', 'info');
+    }
+
+  } catch (err) {
+    console.error(err);
+    setStatus(err.message, 'error');
+  } finally {
+    state.detectingSpeakerNames = false;
+  }
+}
+
 async function handleExportTranscript() {
   if (!state.utterances.length) return;
   const payload = {
@@ -721,79 +773,11 @@ async function downloadEpisode(audioUrl, title, triggerButton = null) {
     state.uploadedFile = null;
     elements.audioPlayer.src = data.audioUrl;
     setStatus('Episode ready', 'success');
-    if (triggerButton) {
-      triggerButton.textContent = 'Ready ✓';
-      triggerButton.classList.add('success');
-    }
   } catch (err) {
     console.error(err);
     setStatus(err.message, 'error');
-    if (triggerButton) {
-      triggerButton.textContent = 'Retry';
-      triggerButton.classList.add('error');
-    }
   } finally {
-    if (triggerButton) {
-      triggerButton.disabled = false;
-      triggerButton.classList.remove('loading');
-      setTimeout(() => {
-        triggerButton.classList.remove('success', 'error');
-        triggerButton.textContent = originalLabel || 'Download';
-      }, 2000);
-    }
+    triggerButton.classList.remove('loading');
+    triggerButton.textContent = 'Download';
   }
 }
-
-function initPodcastInteractions() {
-  elements.podcastResults.addEventListener('click', (event) => {
-    const btn = event.target.closest('button[data-feed]');
-    if (!btn) return;
-    const listItem = btn.closest('.list-item');
-    loadEpisodes(btn.dataset.feed, listItem);
-  });
-
-  elements.episodeResults.addEventListener('click', (event) => {
-    const btn = event.target.closest('button[data-url]');
-    if (!btn) return;
-    downloadEpisode(btn.dataset.url, btn.dataset.title, btn);
-  });
-
-}
-
-function initEventBindings() {
-  elements.transcribeBtn.addEventListener('click', handleTranscription);
-  elements.summaryBtn.addEventListener('click', handleSummaryGeneration);
-  elements.exportTranscriptBtn.addEventListener('click', handleExportTranscript);
-  elements.exportSummaryBtn.addEventListener('click', handleExportSummary);
-  elements.fileInput.addEventListener('change', handleFileUpload);
-  elements.youtubeFetch.addEventListener('click', handleYoutubeFetch);
-  elements.podcastSearch.addEventListener('click', handlePodcastSearch);
-  elements.podcastQuery.addEventListener('keydown', (event) => {
-    if (event.key === 'Enter') {
-      event.preventDefault();
-      handlePodcastSearch();
-    }
-  });
-}
-
-async function init() {
-  initTabs();
-  initSidebarInteractions();
-  initAudioInteractions();
-  initEventBindings();
-  initPodcastInteractions();
-
-  elements.backendSelect.innerHTML = `
-    <option value="moonshine">Moonshine</option>
-    <option value="sensevoice" selected>SenseVoice</option>
-  `;
-  state.backend = elements.backendSelect.value;
-
-  setListEmpty(elements.podcastResults, 'Search to discover podcasts.');
-  setListEmpty(elements.episodeResults, 'Select a podcast to view episodes.');
-
-  await fetchConfig();
-  setStatus('Ready');
-}
-
-init();
frontend/index.html CHANGED
@@ -123,6 +123,7 @@
     <section id="results-tab" class="tab-panel">
       <div class="actions">
         <button id="transcribe-btn" class="primary">Transcribe Audio</button>
+        <button id="detect-speaker-names-btn" class="secondary hidden">Detect Speaker Names</button>
         <button id="summary-btn" class="secondary">Generate Summary</button>
         <span id="status-text" class="status-text">Ready</span>
       </div>
src/server/models/summarization.py CHANGED
@@ -8,3 +8,8 @@ class SummaryRequest(BaseModel):
     llm_model: str
     prompt: str = Field("Summarize the transcript below.")
     generate_title: bool = Field(default=True)
+
+
+class SpeakerNameDetectionRequest(BaseModel):
+    utterances: list = Field(..., min_length=1)
+    llm_model: str
src/server/routers/api.py CHANGED
@@ -7,7 +7,7 @@ from fastapi import APIRouter, File, Form, HTTPException, UploadFile
 from fastapi.responses import StreamingResponse

 from ..models.export import SummaryExportRequest, TranscriptExportRequest
-from ..models.summarization import SummaryRequest
+from ..models.summarization import SummaryRequest, SpeakerNameDetectionRequest
 from ..models.transcription import TranscriptionRequest
 from ..core.config import get_settings
 from ..services import config_service, export_service, podcast_service
@@ -119,3 +119,10 @@ def export_summary(payload: SummaryExportRequest):
         media_type=mime_type,
         headers={"Content-Disposition": content_disposition},
     )
+
+
+@router.post("/detect-speaker-names")
+def detect_speaker_names(request: SpeakerNameDetectionRequest):
+    from src.summarization import detect_speaker_names as detect_names
+    result = detect_names(request.utterances, request.llm_model)
+    return result
src/summarization.py CHANGED
@@ -266,6 +266,122 @@ def generate_title(transcript: str, selected_gguf_model: str) -> str:
     return "Untitled Document"


+def create_speaker_name_detection_prompt() -> PromptTemplate:
+    """Prompt for detecting speaker names from their utterances"""
+    template = """Analyze the following utterances from a single speaker and suggest a name for this speaker. Look for:
+
+1. Self-introductions or self-references
+2. Names mentioned in context
+3. Speech patterns, vocabulary, and topics that might indicate identity
+4. Professional titles, roles, or relationships mentioned
+
+Utterances from this speaker:
+{text}
+
+Based on the content, suggest a name for this speaker. Consider:
+- If the speaker introduces themselves, use that name
+- If the speaker is addressed by others, use that name
+- If the content suggests a clear identity (e.g., "I'm Dr. Smith", "As CEO", "My name is John")
+- If no clear name is evident, suggest "Unknown"
+
+Provide your answer in this exact format:
+NAME: [suggested name]
+CONFIDENCE: [high/medium/low]
+REASON: [brief explanation]
+
+If confidence is "low", the name should not be used."""
+    return PromptTemplate(template=template, input_variables=["text"])
+
+
+def detect_speaker_names(utterances: list, selected_gguf_model: str) -> dict:
+    """
+    Detect speaker names from diarized utterances using LLM analysis.
+
+    Args:
+        utterances: List of utterance dicts with 'speaker', 'text', 'start', 'end' keys
+        selected_gguf_model: The LLM model to use for analysis
+
+    Returns:
+        Dict mapping speaker_id to detected name info:
+        {
+            speaker_id: {
+                'name': str,
+                'confidence': str,  # 'high', 'medium', 'low'
+                'reason': str
+            }
+        }
+    """
+    if not utterances:
+        return {}
+
+    # Group utterances by speaker
+    speaker_utterances = {}
+    for utt in utterances:
+        speaker_id = utt.get('speaker')
+        if speaker_id is not None:
+            if speaker_id not in speaker_utterances:
+                speaker_utterances[speaker_id] = []
+            speaker_utterances[speaker_id].append(utt['text'])
+
+    if not speaker_utterances:
+        return {}
+
+    try:
+        llm = get_llm(selected_gguf_model)
+        prompt = create_speaker_name_detection_prompt()
+
+        speaker_names = {}
+
+        for speaker_id, texts in speaker_utterances.items():
+            # Combine all utterances for this speaker (limit to reasonable length)
+            combined_text = ' '.join(texts)
+            if len(combined_text) > 4000:  # Limit context
+                combined_text = combined_text[:4000] + '...'
+
+            # Format prompt
+            formatted_prompt = prompt.format(text=combined_text)
+
+            # Get LLM response
+            response = llm.create_chat_completion(
+                messages=[
+                    {"role": "system", "content": "You are an expert at analyzing speech patterns and identifying speaker identities from transcripts. Be precise and only suggest names when you have clear evidence."},
+                    {"role": "user", "content": formatted_prompt}
+                ],
+                stream=False,
+                max_tokens=100,
+            )
+
+            result_text = response['choices'][0]['message']['content'].strip()
+
+            # Parse the response
+            name = "Unknown"
+            confidence = "low"
+            reason = "No clear identification found"
+
+            lines = result_text.split('\n')
+            for line in lines:
+                if line.startswith('NAME:'):
+                    name = line.replace('NAME:', '').strip()
+                elif line.startswith('CONFIDENCE:'):
+                    confidence = line.replace('CONFIDENCE:', '').strip().lower()
+                elif line.startswith('REASON:'):
+                    reason = line.replace('REASON:', '').strip()
+
+            # Only include high confidence detections
+            if confidence == 'high' and name != "Unknown":
+                speaker_names[speaker_id] = {
+                    'name': name,
+                    'confidence': confidence,
+                    'reason': reason
+                }
+
+        return speaker_names
+
+    except Exception as e:
+        print(f"Error detecting speaker names: {e}")
+        return {}
+
+
 # Alias pour maintenir la compatibilité
 summarize_transcript = summarize_transcript_langchain

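
To make the parsing and confidence gate in detect_speaker_names concrete, here is a small self-contained sketch; it mirrors the parsing loop above rather than calling the function (no model is loaded), and the sample completion text is invented for illustration.

# Sample completion in the NAME / CONFIDENCE / REASON format the prompt asks for.
sample_reply = """NAME: Dr. Smith
CONFIDENCE: high
REASON: The speaker introduces themselves as Dr. Smith."""

name, confidence, reason = "Unknown", "low", "No clear identification found"
for line in sample_reply.split("\n"):
    if line.startswith("NAME:"):
        name = line.replace("NAME:", "").strip()
    elif line.startswith("CONFIDENCE:"):
        confidence = line.replace("CONFIDENCE:", "").strip().lower()
    elif line.startswith("REASON:"):
        reason = line.replace("REASON:", "").strip()

# Only high-confidence, non-"Unknown" results are kept; a medium- or
# low-confidence reply leaves the speaker tag as the default "Speaker N".
if confidence == "high" and name != "Unknown":
    print({"name": name, "confidence": confidence, "reason": reason})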