wjbmattingly committed on
Commit
71303dd
·
verified ·
1 Parent(s): 53af227

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +19 -0
  2. app.py +438 -0
  3. requirements.txt +5 -0
  4. templates/index.html +0 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /app

# Install Python dependencies first so this layer is cached across code-only changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Bake the spaCy English model into the image so startup needs no download
RUN python -m spacy download en_core_web_sm

# Copy application files
COPY . .

# HuggingFace Spaces routes traffic to port 7860 (see app.py's default PORT)
EXPOSE 7860

# Run the Flask app
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from flask import Flask, render_template, request, jsonify
import spacy
import json
import requests
from gliner import GLiNER

app = Flask(__name__)

# Blank English spaCy pipeline: tokenizer only, no tagger/parser/NER components.
nlp = spacy.blank("en")

# spaCy pipeline with the gliner_spacy component; built lazily on first /run_gliner call.
gliner_nlp = None

# GLiNER multitask model for /run_gliner_relationships; loaded lazily on first use.
gliner_multitask = None
17
+
18
def get_or_create_multitask_model():
    """
    Lazily load and memoize the GLiNER multitask model used for
    relationship extraction.

    Returns:
        The cached GLiNER model instance, or None when loading fails.
    """
    global gliner_multitask

    # Already loaded: hand back the cached instance.
    if gliner_multitask is not None:
        return gliner_multitask

    try:
        gliner_multitask = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
    except Exception as e:
        print(f"Error loading GLiNER multitask model: {e}")
        return None

    return gliner_multitask
32
+
33
@app.route('/')
def index():
    """Serve the annotation tool's single-page UI (templates/index.html)."""
    return render_template('index.html')
36
+
37
@app.route('/tokenize', methods=['POST'])
def tokenize_text():
    """
    Tokenize the posted text and return each token's character span.

    Expects JSON {"text": str}; responds with {"tokens": [...], "text": str}
    where each token carries its surface form and [start, end) offsets,
    or a 400 error when no text is supplied.
    """
    payload = request.get_json()
    text = payload.get('text', '')

    if not text:
        return jsonify({'error': 'No text provided'}), 400

    # Tokenize with the blank spaCy pipeline.
    doc = nlp(text)

    # One entry per token: surface text plus character offsets.
    tokens = [
        {'text': tok.text, 'start': tok.idx, 'end': tok.idx + len(tok.text)}
        for tok in doc
    ]

    return jsonify({'tokens': tokens, 'text': text})
64
+
65
@app.route('/find_token_boundaries', methods=['POST'])
def find_token_boundaries():
    """
    Snap a character selection to whole-token boundaries.

    Expects JSON with "text", "start", "end", and optional "label";
    returns the expanded span, the text it covers, and the label.
    When no token overlaps the selection, the raw offsets are returned.
    """
    payload = request.get_json()
    text = payload.get('text', '')
    start = payload.get('start', 0)
    end = payload.get('end', 0)
    label = payload.get('label', 'UNLABELED')

    if not text:
        return jsonify({'error': 'No text provided'}), 400

    doc = nlp(text)

    # Character spans of every token that overlaps the selection, in order.
    overlapping = [
        (tok.idx, tok.idx + len(tok.text))
        for tok in doc
        if tok.idx < end and tok.idx + len(tok.text) > start
    ]

    if overlapping:
        # Expand to the first overlapping token's start and the last one's end.
        token_start = overlapping[0][0]
        token_end = overlapping[-1][1]
    else:
        # Nothing overlapped (e.g. whitespace-only selection): keep the raw span.
        token_start, token_end = start, end

    return jsonify({
        'start': token_start,
        'end': token_end,
        'selected_text': text[token_start:token_end],
        'label': label
    })
104
+
105
@app.route('/get_default_labels', methods=['GET'])
def get_default_labels():
    """Return the built-in entity labels with their fill and border colors."""
    palette = [
        ('PERSON', '#fef3c7', '#f59e0b'),
        ('LOCATION', '#dbeafe', '#3b82f6'),
        ('ORGANIZATION', '#dcfce7', '#10b981'),
    ]

    labels = [
        {'name': name, 'color': fill, 'border': border}
        for name, fill, border in palette
    ]

    return jsonify({'labels': labels})
117
+
118
@app.route('/get_default_relationship_labels', methods=['GET'])
def get_default_relationship_labels():
    """Return the built-in relationship labels with their fill and border colors."""
    palette = [
        ('worked at', '#fce7f3', '#ec4899'),
        ('visited', '#f3e8ff', '#a855f7'),
    ]

    relationship_labels = [
        {'name': name, 'color': fill, 'border': border}
        for name, fill, border in palette
    ]

    return jsonify({'relationship_labels': relationship_labels})
129
+
130
def get_or_create_gliner_pipeline(labels):
    """
    Return a spaCy pipeline with the gliner_spacy component configured
    for the given labels.

    The pipeline is cached at module level and rebuilt only when the
    requested label set actually changes — constructing it reloads the
    GLiNER model, which is expensive to do per request. (The original
    comment promised this caching but the code rebuilt unconditionally.)
    GLiNER expects lowercase labels, so user labels are lowercased first.

    Args:
        labels: Iterable of entity label strings chosen by the user.

    Returns:
        The configured spaCy Language object, or None on failure.
    """
    global gliner_nlp

    # Convert labels to lowercase for GLiNER
    gliner_labels = [label.lower() for label in labels]

    # Reuse the existing pipeline when the label set is unchanged.
    if gliner_nlp is not None and getattr(get_or_create_gliner_pipeline, '_labels', None) == gliner_labels:
        return gliner_nlp

    try:
        custom_spacy_config = {
            "gliner_model": "gliner-community/gliner_small-v2.5",
            "chunk_size": 250,
            "labels": gliner_labels,
            "style": "ent"
        }

        gliner_nlp = spacy.blank("en")
        gliner_nlp.add_pipe("gliner_spacy", config=custom_spacy_config)
        # Remember which labels this pipeline was built for.
        get_or_create_gliner_pipeline._labels = gliner_labels

        return gliner_nlp
    except Exception as e:
        print(f"Error creating GLiNER pipeline: {e}")
        return None
155
+
156
@app.route('/run_gliner', methods=['POST'])
def run_gliner():
    """
    Run GLiNER entity extraction on the provided text with specified labels.

    Expects JSON {"text": str, "labels": [str, ...]}. Returns detected
    entities with character offsets, the user's original label casing, and
    a confidence score (1.0 when the span exposes no score attribute).
    """
    data = request.get_json()
    text = data.get('text', '')
    labels = data.get('labels', [])

    if not text:
        return jsonify({'error': 'No text provided'}), 400

    if not labels:
        return jsonify({'error': 'No labels provided'}), 400

    try:
        # Get or create GLiNER pipeline
        pipeline = get_or_create_gliner_pipeline(labels)

        if pipeline is None:
            return jsonify({'error': 'Failed to initialize GLiNER pipeline'}), 500

        # Process text with GLiNER
        doc = pipeline(text)

        # Map GLiNER's lowercased labels back to the user's original casing
        # via a dict lookup instead of a per-entity scan of the label list.
        label_by_lower = {label.lower(): label for label in labels}

        entities = []
        for ent in doc.ents:
            original_label = label_by_lower.get(ent.label_.lower())
            if original_label:
                entities.append({
                    'text': ent.text,
                    'start': ent.start_char,
                    'end': ent.end_char,
                    'label': original_label,
                    # getattr's default already covers spans without a score;
                    # the extra hasattr check was redundant.
                    'confidence': getattr(ent, 'score', 1.0)
                })

        return jsonify({
            'entities': entities,
            'total_found': len(entities)
        })

    except Exception as e:
        print(f"GLiNER processing error: {e}")
        return jsonify({'error': f'GLiNER processing failed: {str(e)}'}), 500
208
+
209
@app.route('/run_gliner_relationships', methods=['POST'])
def run_gliner_relationships():
    """
    Run GLiNER relationship extraction on the provided text.

    Expects JSON with "text", "relationship_labels", and optional
    "entity_labels". Plain entity mentions are extracted first; relation
    mentions are then extracted with the multitask model's
    "entity <> relation" composite-label convention, and each relation
    mention is heuristically paired with the nearest preceding entity
    (subject) and nearest following entity (object) within 100 characters.
    Overlapping entities produce partial triplets with an empty object.

    Returns:
        JSON {"relationships": [...], "total_found": int}.
    """
    data = request.get_json()
    text = data.get('text', '')
    relationship_labels = data.get('relationship_labels', [])
    entity_labels = data.get('entity_labels', ["person", "organization", "location", "date", "place"])

    if not text:
        return jsonify({'error': 'No text provided'}), 400

    if not relationship_labels:
        return jsonify({'error': 'No relationship labels provided'}), 400

    try:
        # Get GLiNER multitask model
        model = get_or_create_multitask_model()

        if model is None:
            return jsonify({'error': 'Failed to initialize GLiNER multitask model'}), 500

        # Pass 1: plain entity mentions, used below as subject/object candidates.
        entities = model.predict_entities(text, entity_labels, threshold=0.3)

        # Pass 2: relation mentions via "entity <> relation" composite labels.
        formatted_labels = [
            f"{entity_label} <> {label}"
            for label in relationship_labels
            for entity_label in entity_labels
        ]

        relation_entities = model.predict_entities(text, formatted_labels, threshold=0.3)

        # Process results into relationship triplets.
        relationships = []

        for rel_entity in relation_entities:
            label_parts = rel_entity['label'].split(' <> ')
            if len(label_parts) != 2:
                # Unexpected label shape: skip rather than guess.
                continue
            entity_type, relation_type = label_parts

            rel_start = rel_entity['start']
            rel_end = rel_entity['end']

            # Entities ending shortly before / starting shortly after the
            # relation mention (within 100 characters on either side).
            subject_candidates = [e for e in entities if e['end'] <= rel_start and abs(e['end'] - rel_start) < 100]
            object_candidates = [e for e in entities if e['start'] >= rel_end and abs(e['start'] - rel_end) < 100]

            # Entities that contain, or are contained by, the relation span.
            overlapping_entities = [
                e for e in entities
                if (e['start'] <= rel_start and e['end'] >= rel_end)
                or (rel_start <= e['start'] and rel_end >= e['end'])
            ]

            if subject_candidates and object_candidates:
                # Take the entities closest to the relation mention on each side.
                subject = max(subject_candidates, key=lambda x: x['end'])
                object_entity = min(object_candidates, key=lambda x: x['start'])

                relationships.append({
                    'subject': subject['text'],
                    'subject_start': subject['start'],
                    'subject_end': subject['end'],
                    'relation_type': relation_type,
                    'relation_text': rel_entity['text'],
                    'relation_start': rel_entity['start'],
                    'relation_end': rel_entity['end'],
                    'object': object_entity['text'],
                    'object_start': object_entity['start'],
                    'object_end': object_entity['end'],
                    'confidence': rel_entity['score'],
                    'full_text': f"{subject['text']} {relation_type} {object_entity['text']}"
                })
            elif overlapping_entities:
                # The relation span overlaps an entity: emit a partial triplet
                # whose object the user can fill in later.
                for ent in overlapping_entities:
                    relationships.append({
                        'subject': ent['text'],
                        'subject_start': ent['start'],
                        'subject_end': ent['end'],
                        'relation_type': relation_type,
                        'relation_text': rel_entity['text'],
                        'relation_start': rel_entity['start'],
                        'relation_end': rel_entity['end'],
                        'object': '',  # Will be filled by user or further processing
                        'object_start': -1,
                        'object_end': -1,
                        'confidence': rel_entity['score'],
                        'full_text': f"{ent['text']} {relation_type} [object]"
                    })

        return jsonify({
            'relationships': relationships,
            'total_found': len(relationships)
        })

    except Exception as e:
        print(f"GLiNER relationship processing error: {e}")
        return jsonify({'error': f'GLiNER relationship processing failed: {str(e)}'}), 500
316
+
317
@app.route('/search_wikidata', methods=['POST'])
def search_wikidata():
    """
    Search Wikidata for items matching a free-text query.

    Expects JSON {"query": str, "limit": int (optional, default 10)} and
    returns matching items, each with id, label, description, and URL.
    """
    payload = request.get_json()
    query = payload.get('query', '').strip()
    limit = payload.get('limit', 10)

    if not query:
        return jsonify({'error': 'No query provided'}), 400

    try:
        # Wikidata's wbsearchentities endpoint performs the matching for us.
        response = requests.get(
            'https://www.wikidata.org/w/api.php',
            params={
                'action': 'wbsearchentities',
                'search': query,
                'language': 'en',
                'format': 'json',
                'limit': limit,
                'type': 'item',
            },
            headers={
                'User-Agent': 'AnnotationTool/1.0 (https://github.com/user/annotation-tool) Python/requests'
            },
            timeout=10,
        )
        response.raise_for_status()

        body = response.json()

        # Keep only the fields the UI needs from each hit.
        results = [
            {
                'id': item.get('id', ''),
                'label': item.get('label', ''),
                'description': item.get('description', ''),
                'url': f"https://www.wikidata.org/wiki/{item.get('id', '')}",
            }
            for item in body.get('search', [])
        ]

        return jsonify({'results': results, 'total': len(results)})

    except requests.exceptions.RequestException as e:
        print(f"Wikidata API error: {e}")
        return jsonify({'error': 'Failed to search Wikidata'}), 500
    except Exception as e:
        print(f"Wikidata search error: {e}")
        return jsonify({'error': f'Search failed: {str(e)}'}), 500
374
+
375
@app.route('/get_wikidata_entity', methods=['POST'])
def get_wikidata_entity():
    """
    Look up a single Wikidata item by its Q-code.

    Expects JSON {"qcode": str}. A missing leading "Q" is added
    automatically. Returns the item's label, description, and URL, or a
    404 when the entity does not exist.
    """
    payload = request.get_json()
    qcode = payload.get('qcode', '').strip()

    if not qcode:
        return jsonify({'error': 'No Q-code provided'}), 400

    # Normalize to the canonical "Q..." identifier shape.
    if not qcode.startswith('Q'):
        qcode = 'Q' + qcode.lstrip('Q')

    try:
        response = requests.get(
            'https://www.wikidata.org/w/api.php',
            params={
                'action': 'wbgetentities',
                'ids': qcode,
                'languages': 'en',
                'format': 'json',
            },
            headers={
                'User-Agent': 'AnnotationTool/1.0 (https://github.com/user/annotation-tool) Python/requests'
            },
            timeout=10,
        )
        response.raise_for_status()

        body = response.json()
        entity = body.get('entities', {}).get(qcode)

        # The API reports unknown ids with a "missing" flag on the entry.
        if entity is None or 'missing' in entity:
            return jsonify({'error': f'Entity {qcode} not found'}), 404

        result = {
            'id': qcode,
            'label': entity.get('labels', {}).get('en', {}).get('value', ''),
            'description': entity.get('descriptions', {}).get('en', {}).get('value', ''),
            'url': f"https://www.wikidata.org/wiki/{qcode}"
        }

        return jsonify({'entity': result})

    except requests.exceptions.RequestException as e:
        print(f"Wikidata API error: {e}")
        return jsonify({'error': 'Failed to get Wikidata entity'}), 500
    except Exception as e:
        print(f"Wikidata entity error: {e}")
        return jsonify({'error': f'Request failed: {str(e)}'}), 500
434
+
435
if __name__ == '__main__':
    import os
    # HuggingFace Spaces serves on port 7860 (matches the Dockerfile EXPOSE);
    # the PORT environment variable overrides it for other deployments.
    port = int(os.environ.get('PORT', 7860))
    app.run(host='0.0.0.0', port=port, debug=False)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Web framework serving the annotation UI and JSON API
Flask
# Tokenization plus the gliner_spacy pipeline component's host library
spacy>=3.0.0
gliner-spacy
# GLiNER models for entity and relationship extraction
gliner
# Wikidata HTTP API calls
requests
templates/index.html ADDED
The diff for this file is too large to render. See raw diff