import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
import pandas as pd

# Load the model and tokenizer, and move the model to the GPU if one is available
tokenizer = AutoTokenizer.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")
model = AutoModelForSequenceClassification.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Split a text into segments of at most 512 tokens
def split_text(text):
    # Keep only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Completed segments and the segment currently being built
    segments = []
    current_segment = []
    token_count = 0
    # Loop through the tokens
    for token in tokens:
        # Add the token to the current segment
        current_segment.append(token)
        token_count += 1
        # Once the segment reaches 512 tokens, close it and start a new one
        if token_count == 512:
            segments.append(tokenizer.convert_tokens_to_string(current_segment))
            current_segment = []
            token_count = 0
    # Add any remaining tokens as the final segment
    if current_segment:
        segments.append(tokenizer.convert_tokens_to_string(current_segment))
    # Return the segments list
    return segments
def classify(text):
    # Define the labels
    labels = ["depression", "anxiety", "bipolar disorder", "schizophrenia", "PTSD", "OCD", "ADHD", "autism", "eating disorder", "personality disorder", "phobia"]
    # labels = list(model.config.id2label.values())
    # Split the text into segments
    segments = split_text(text)
    # Fall back to the raw text if cleaning removed everything
    if not segments:
        segments = [text]
    # Collect the logits for each segment
    logits_list = []
    for segment in segments:
        # Encode the segment together with the labels
        inputs = tokenizer([segment] + labels, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Move the input ids and attention mask to the device
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        # Get the model outputs for the segment
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        # Append the logits for this segment to the logits list
        logits_list.append(outputs.logits)
    # Average the logits across the segments
    avg_logits = torch.mean(torch.stack(logits_list), dim=0)
    # Apply softmax to convert logits to probabilities
    probabilities = torch.softmax(avg_logits, dim=1)
    # Probabilities for each label; row 0 corresponds to the text segment
    label_probabilities = probabilities[:, :len(labels)].tolist()[0]
    # Get the top 3 most likely labels and their probabilities
    top_labels = []
    top_probabilities = []
    for _ in range(3):
        max_prob_index = label_probabilities.index(max(label_probabilities))
        top_labels.append(labels[max_prob_index])
        top_probabilities.append(label_probabilities[max_prob_index])
        # Zero out the current maximum so the next pass finds the next highest probability
        label_probabilities[max_prob_index] = 0
    # Store the results
    results = {
        "sequence": text,
        "top_labels": top_labels,
        "top_probabilities": top_probabilities
    }
    return results
# Streamlit app
st.title("Text Classification")
st.write("Enter some text, and the model will classify it.")

text_input = st.text_input("Text Input")

if st.button("Classify") and text_input:
    predictions = classify(text_input)
    labels_str = ", ".join(predictions["top_labels"])
    probs_str = ", ".join(f"{p:.2f}" for p in predictions["top_probabilities"])
    # Optionally show the results as a table:
    # df = pd.DataFrame({"label": predictions["top_labels"], "probability": predictions["top_probabilities"]})
    # st.dataframe(df.style.format({"probability": "{:.2f}"}))
    st.write(f"Labels: {labels_str}")
    st.write(f"Probabilities: {probs_str}")