dalybuilds committed on
Commit
2e6655e
·
verified ·
1 Parent(s): 0b3f5e5

Update dataset_utils.py

Browse files
Files changed (1) hide show
  1. dataset_utils.py +14 -13
dataset_utils.py CHANGED
@@ -3,21 +3,22 @@
3
  from datasets import load_dataset
4
  from transformers import ViTImageProcessor
5
  from PIL import Image
 
6
 
7
- def load_insect_dataset():
8
- return load_dataset("Francesco/insects-mytwu")
 
 
 
 
 
 
 
 
 
 
9
 
10
  def preprocess_image(image_path):
11
  processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
12
  image = Image.open(image_path).convert("RGB")
13
- return processor(image, return_tensors="pt")
14
-
15
- def load_species_descriptions():
16
- dataset = load_dataset("Gharaee/BIOSCAN-5M")
17
- descriptions = {}
18
- # Limit to the first 500 records for testing
19
- for record in dataset["train"].select(range(500)):
20
- species_name = record.get("species_name", "Unknown Species")
21
- description = record.get("description", "No description available.")
22
- descriptions[species_name] = description
23
- return descriptions
 
3
  from datasets import load_dataset
4
  from transformers import ViTImageProcessor
5
  from PIL import Image
6
+ import os
7
 
8
class DatasetHandler:
    """Load species descriptions from a Hugging Face dataset.

    The dataset named by ``dataset_name`` is fetched with ``load_dataset``
    every time :meth:`load_descriptions` is called; nothing is cached on the
    instance.
    """

    def __init__(self, dataset_name="Gharaee/BIOSCAN-5M"):
        """Remember which dataset to read descriptions from.

        Args:
            dataset_name: Identifier passed unchanged to ``load_dataset``.
        """
        self.dataset_name = dataset_name

    def load_descriptions(self, max_records=500):
        """Return a ``{species_name: description}`` mapping.

        Only the leading rows of the ``"train"`` split are read.

        Args:
            max_records: Upper bound on the number of rows to read. Values
                larger than the split are clamped to its length, negative
                values read nothing, and ``None`` reads every row.

        Returns:
            A dict keyed by each record's ``"species_name"`` (or
            ``"Unknown Species"`` when the key is absent) whose value is the
            record's ``"description"`` (or ``"No description available."``
            when absent). When several rows share a name, the last one read
            wins.
        """
        train_split = load_dataset(self.dataset_name)["train"]

        # Clamp the request to what actually exists: selecting indices past
        # the end of the split raises IndexError instead of truncating.
        available = len(train_split)
        if max_records is None:
            limit = available
        else:
            limit = max(0, min(max_records, available))

        descriptions = {}
        for record in train_split.select(range(limit)):
            species_name = record.get("species_name", "Unknown Species")
            description = record.get("description", "No description available.")
            descriptions[species_name] = description
        return descriptions
20
 
# Image processors already loaded by preprocess_image, keyed by checkpoint
# name, so repeated calls do not re-read the processor configuration.
_PROCESSOR_CACHE = {}


def _get_processor(model_name):
    """Return the ViT image processor for *model_name*, loading it at most once."""
    processor = _PROCESSOR_CACHE.get(model_name)
    if processor is None:
        processor = ViTImageProcessor.from_pretrained(model_name)
        _PROCESSOR_CACHE[model_name] = processor
    return processor


def preprocess_image(image_path, model_name="google/vit-base-patch16-224"):
    """Open an image, convert it to RGB and run it through a ViT processor.

    Args:
        image_path: Path (or file object) handed to ``PIL.Image.open``.
        model_name: Checkpoint whose image processor is used. Defaults to
            the checkpoint this function has always used.

    Returns:
        Whatever the processor returns when called with
        ``return_tensors="pt"``.
    """
    # Resolve the processor first, matching the original order of operations.
    processor = _get_processor(model_name)

    # The context manager closes the underlying file once the RGB copy
    # exists, rather than leaving that to garbage collection.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    return processor(image, return_tensors="pt")