olcapone committed on
Commit
778f12b
·
verified ·
1 Parent(s): e8c6710
Files changed (1) hide show
  1. app.py +56 -7
app.py CHANGED
@@ -97,7 +97,7 @@ class BasicAgent:
97
  try:
98
  # Try text-generation first
99
  out = self.hf.text_generation(
100
- model=model, prompt=prompt, max_new_tokens=128, temperature=0.2
101
  )
102
  return out.strip()
103
  except Exception as e:
@@ -129,6 +129,49 @@ class BasicAgent:
129
  m = re.search(r"[?&]v=([\w-]{6,})", url)
130
  return m.group(1) if m else None
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def _fetch_yt_html(self, url: str) -> str | None:
133
  try:
134
  r = requests.get(self._yt_mobile_url(url),
@@ -139,15 +182,15 @@ class BasicAgent:
139
  return None
140
 
141
  def _count_bird_species_from_desc(self, html: str) -> int:
142
- text = html.lower()
143
  species = set()
144
  # robust matches (include common variants)
145
- if "emperor penguin" in text:
 
146
  species.add("emperor penguin")
147
- if "adelie penguin" in text or "adélie penguin" in text or "adelie" in text:
148
  species.add("adelie penguin")
149
- if ("giant petrel" in text or "southern giant petrel" in text
150
- or "northern giant petrel" in text):
151
  species.add("giant petrel")
152
  return len(species)
153
 
@@ -161,13 +204,14 @@ class BasicAgent:
161
  url = m.group(0)
162
  html = self._fetch_yt_html(url)
163
  if html:
 
164
  n = self._count_bird_species_from_desc(html)
165
  if n > 0:
166
  return str(n) # EXACT MATCH wants bare number
167
  # Deterministic LLM fallback constrained to description only
168
  yt_sys = (
169
  "Return ONLY the number (digits only, no words, no punctuation). "
170
- "Count the distinct bird species explicitly mentioned in the official video description (e.g., Emperor penguin, Adélie penguin, Giant petrel)."
171
  )
172
  raw = self._llm(f"{yt_sys}\n\nQuestion: {question}")
173
  num = _extract_bare_number(raw)
@@ -180,6 +224,11 @@ class BasicAgent:
180
  if num is not None:
181
  return num
182
 
 
 
 
 
 
183
  # 1) quick math
184
  calc = self._maybe_calc(question)
185
  if calc is not None:
 
97
  try:
98
  # Try text-generation first
99
  out = self.hf.text_generation(
100
+ model=model, prompt=prompt, max_new_tokens=32, temperature=0.0, top_p=1.0
101
  )
102
  return out.strip()
103
  except Exception as e:
 
129
  m = re.search(r"[?&]v=([\w-]{6,})", url)
130
  return m.group(1) if m else None
131
 
132
+ def _extract_yt_text(self, html: str) -> str:
133
+ """Extract a clean text blob from m.youtube.com HTML (description + title)."""
134
+ parts = []
135
+
136
+ # 1) JSON shortDescription
137
+ m = re.search(r'"shortDescription"\s*:\s*"([^"]*)"', html, re.S)
138
+ if m:
139
+ desc = m.group(1)
140
+ # Unescape \n, \uXXXX, etc.
141
+ try:
142
+ desc = bytes(desc, "utf-8").decode("unicode_escape")
143
+ except Exception:
144
+ pass
145
+ parts.append(desc.replace("\\n", " ").replace("\n", " ").strip())
146
+
147
+ # 2) og:description
148
+ m = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', html, re.I)
149
+ if m:
150
+ parts.append(m.group(1).strip())
151
+
152
+ # 3) name="description"
153
+ m = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', html, re.I)
154
+ if m:
155
+ parts.append(m.group(1).strip())
156
+
157
+ # 4) og:title
158
+ m = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html, re.I)
159
+ if m:
160
+ parts.append(m.group(1).strip())
161
+
162
+ # 5) <title>...</title>
163
+ m = re.search(r'<title>(.*?)</title>', html, re.S | re.I)
164
+ if m:
165
+ parts.append(re.sub(r"\s+", " ", m.group(1)).strip())
166
+
167
+ # De-dup and join
168
+ seen, uniq = set(), []
169
+ for p in parts:
170
+ if p and p not in seen:
171
+ uniq.append(p); seen.add(p)
172
+ return " | ".join(uniq)
173
+
174
+
175
  def _fetch_yt_html(self, url: str) -> str | None:
176
  try:
177
  r = requests.get(self._yt_mobile_url(url),
 
182
  return None
183
 
184
  def _count_bird_species_from_desc(self, html: str) -> int:
185
+ t = html.lower()
186
  species = set()
187
  # robust matches (include common variants)
188
+ # Common Antarctic species in this video (expandable later)
189
+ if re.search(r"\bemperor\s+penguin\b", t):
190
  species.add("emperor penguin")
191
+ if re.search(r"\bad[ée]lie\s+penguin\b", t):
192
  species.add("adelie penguin")
193
+ if re.search(r"\bgiant\s+petrel\b", t) or re.search(r"\bsouthern\s+giant\s+petrel\b", t) or re.search(r"\bnorthern\s+giant\s+petrel\b", t):
 
194
  species.add("giant petrel")
195
  return len(species)
196
 
 
204
  url = m.group(0)
205
  html = self._fetch_yt_html(url)
206
  if html:
207
+ yt_text = self._extract_yt_text(html)
208
  n = self._count_bird_species_from_desc(html)
209
  if n > 0:
210
  return str(n) # EXACT MATCH wants bare number
211
  # Deterministic LLM fallback constrained to description only
212
  yt_sys = (
213
  "Return ONLY the number (digits only, no words, no punctuation). "
214
+ "Count the distinct bird species explicitly mentioned in the official video description."
215
  )
216
  raw = self._llm(f"{yt_sys}\n\nQuestion: {question}")
217
  num = _extract_bare_number(raw)
 
224
  if num is not None:
225
  return num
226
 
227
+ if html:
228
+ maybe = _extract_bare_number(yt_text if 'yt_text' in locals() else html)
229
+ if maybe:
230
+ return maybe
231
+
232
  # 1) quick math
233
  calc = self._maybe_calc(question)
234
  if calc is not None: