Spaces:
Sleeping
Sleeping
Update dataset_utils.py
Browse files- dataset_utils.py +14 -13
dataset_utils.py
CHANGED
|
@@ -3,21 +3,22 @@
|
|
| 3 |
from datasets import load_dataset
|
| 4 |
from transformers import ViTImageProcessor
|
| 5 |
from PIL import Image
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Processors already loaded, keyed by checkpoint name, so that repeated calls
# do not reload the same configuration from disk/network every time.
_VIT_PROCESSOR_CACHE = {}


def _get_vit_processor(checkpoint="google/vit-base-patch16-224"):
    """Return a shared ``ViTImageProcessor`` for *checkpoint*, loading it once."""
    if checkpoint not in _VIT_PROCESSOR_CACHE:
        _VIT_PROCESSOR_CACHE[checkpoint] = ViTImageProcessor.from_pretrained(checkpoint)
    return _VIT_PROCESSOR_CACHE[checkpoint]


def preprocess_image(image_path):
    """Open an image file and run it through the ViT image processor.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The output of calling the ``google/vit-base-patch16-224`` image
        processor on the RGB-converted image with ``return_tensors="pt"``.
    """
    processor = _get_vit_processor()
    # ``convert`` returns a new, fully loaded image, so the underlying file
    # can be closed as soon as the conversion is done.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    return processor(image, return_tensors="pt")
|
| 14 |
-
|
| 15 |
-
def load_species_descriptions(max_records=500, dataset_name="Gharaee/BIOSCAN-5M"):
    """Build a ``{species_name: description}`` mapping from a dataset's train split.

    Args:
        max_records: Maximum number of leading "train" records to read.
            Defaults to 500 (a small sample intended for testing). Values
            larger than the split are clamped to its length; zero or a
            negative value yields an empty mapping without loading anything.
        dataset_name: Identifier passed to ``load_dataset``.

    Returns:
        A dict keyed by each record's "species_name" (or "Unknown Species"
        when the key is absent) whose value is the record's "description"
        (or "No description available." when absent). When several records
        share a species name, the last one read wins.
    """
    if max_records <= 0:
        # Nothing requested: skip the (potentially very large) dataset load.
        return {}

    train_split = load_dataset(dataset_name)["train"]
    # Clamp so that a split shorter than ``max_records`` does not make
    # ``select`` fail on out-of-range indices.
    record_count = min(max_records, len(train_split))

    descriptions = {}
    # NOTE(review): confirm the dataset actually exposes "species_name" and
    # "description" fields; otherwise every record falls back to the defaults.
    for record in train_split.select(range(record_count)):
        species_name = record.get("species_name", "Unknown Species")
        description = record.get("description", "No description available.")
        descriptions[species_name] = description
    return descriptions
|
|
|
|
| 3 |
from datasets import load_dataset
|
| 4 |
from transformers import ViTImageProcessor
|
| 5 |
from PIL import Image
|
| 6 |
+
import os
|
| 7 |
|
| 8 |
+
class DatasetHandler:
    """Load species descriptions from a Hugging Face dataset."""

    def __init__(self, dataset_name: str = "Gharaee/BIOSCAN-5M") -> None:
        """Remember which dataset to load.

        Args:
            dataset_name: Identifier later passed to ``load_dataset``.
        """
        self.dataset_name = dataset_name

    def load_descriptions(self, max_records: int = 500) -> dict:
        """Build a ``{species_name: description}`` mapping from the train split.

        Args:
            max_records: Maximum number of leading "train" records to read.
                Values larger than the split are clamped to its length; zero
                or a negative value yields an empty mapping without loading
                the dataset at all.

        Returns:
            A dict keyed by each record's "species_name" (or
            "Unknown Species" when the key is absent) whose value is the
            record's "description" (or "No description available." when
            absent). When several records share a species name, the last
            one read wins.
        """
        if max_records <= 0:
            # Nothing requested: skip the (potentially very large) dataset load.
            return {}

        train_split = load_dataset(self.dataset_name)["train"]
        # Clamp so that a split shorter than ``max_records`` does not make
        # ``select`` fail on out-of-range indices.
        record_count = min(max_records, len(train_split))

        descriptions = {}
        # NOTE(review): confirm the dataset actually exposes "species_name"
        # and "description" fields; otherwise every record falls back to the
        # defaults below.
        for record in train_split.select(range(record_count)):
            species_name = record.get("species_name", "Unknown Species")
            description = record.get("description", "No description available.")
            descriptions[species_name] = description
        return descriptions
|
| 20 |
|
| 21 |
# Processors already loaded, keyed by checkpoint name, so that repeated calls
# do not reload the same configuration from disk/network every time.
_VIT_PROCESSOR_CACHE = {}


def _get_vit_processor(checkpoint="google/vit-base-patch16-224"):
    """Return a shared ``ViTImageProcessor`` for *checkpoint*, loading it once."""
    if checkpoint not in _VIT_PROCESSOR_CACHE:
        _VIT_PROCESSOR_CACHE[checkpoint] = ViTImageProcessor.from_pretrained(checkpoint)
    return _VIT_PROCESSOR_CACHE[checkpoint]


def preprocess_image(image_path):
    """Open an image file and run it through the ViT image processor.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The output of calling the ``google/vit-base-patch16-224`` image
        processor on the RGB-converted image with ``return_tensors="pt"``.
    """
    processor = _get_vit_processor()
    # ``convert`` returns a new, fully loaded image, so the underlying file
    # can be closed as soon as the conversion is done.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    return processor(image, return_tensors="pt")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|