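"""Gradio dashboard exploring the metadata attached to models on the Hugging Face Hub.

The underlying data is the open-source-metrics/model-repos-stats dataset; it is
downloaded once and cached locally as data.parquet.
"""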
from ast import literal_eval
from functools import lru_cache
from itertools import combinations
from pathlib import Path
from typing import List, Optional, Union

import gradio as gr
import pandas as pd
from cytoolz import concat, frequencies, unique
from datasets import load_dataset

pd.options.plotting.backend = "plotly"
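# Download the raw per-repo metadata dump published on the Hub.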
def download_dataset():
    return load_dataset(
        "open-source-metrics/model-repos-stats",
        split="train",
        ignore_verifications=True,
    )
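# The tag-like columns arrive as stringified Python literals, e.g. "['pytorch', 'bert']".
# Parse them back into lists of strings, treating anything malformed as "no tags".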
def _clean_tags(tags: Optional[Union[str, List[str]]]):
    try:
        tags = literal_eval(tags)
        if isinstance(tags, str):
            return [tags]
        if isinstance(tags, list):
            return [tag for tag in tags if isinstance(tag, str)]
        return []
    except (ValueError, SyntaxError):
        return []
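# Autogenerated model cards are detected via their tags: frameworks that push a
# README automatically include a tag containing "generated" (e.g. a tag like
# "generated_from_trainer"; exact values vary by framework).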
def _is_generated_from_tag(tags):
    return any("generated" in tag for tag in tags)


def _parse_tags_for_generated(tags):
    for tag in tags:
        if "generated" in tag:
            return tag
    return None
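# Flatten the dataset into a DataFrame, normalise the tag-like columns, and add
# boolean "has_*" columns used throughout the dashboard for coverage stats.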
def prep_dataset():
    ds = download_dataset()
    df = ds.to_pandas()
    df["languages"] = df["languages"].apply(_clean_tags)
    df["datasets"] = df["datasets"].apply(_clean_tags)
    df["tags"] = df["tags"].apply(_clean_tags)
    df["has_languages"] = df.languages.apply(len) > 0
    df["has_tags"] = df.tags.apply(len) > 0
    df["has_dataset"] = df.datasets.apply(len) > 0
    df["has_co2"] = df.co2.notnull()
    df["has_license"] = df.license.notnull()
    df["is_generated"] = df.tags.apply(_is_generated_from_tag)
    df = df.drop(columns=["Unnamed: 0"])
    df.to_parquet("data.parquet")
    return df
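# Load the cached parquet file if it exists, otherwise build it; memoised so the
# many UI callbacks below share a single in-memory frame.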
@lru_cache(maxsize=None)
def load_data():
    return (
        pd.read_parquet("data.parquet")
        if Path("data.parquet").exists()
        else prep_dataset()
    )
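# Library-level helpers: restrict the frame to a single library, or list the
# libraries with more than `min_freq` models.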
def filter_df_by_library(library="transformers"):
    df = load_data()
    return df[df["library"] == library] if library else df


def get_library_choices(min_freq: int = 50):
    df = load_data()
    library_counts = df.library.value_counts()
    return library_counts[library_counts > min_freq].index.to_list()
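# Tag-level helpers. Case sensitive "duplicates" are pairs of tags that are
# identical once lowercased, e.g. ("translation", "Translation").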
def get_all_tags():
    df = load_data()
    tags = df["tags"].to_list()
    return list(concat(tags))


def get_case_sensitive_duplicate_tags():
    tags = get_all_tags()
    unique_tags = unique(tags)
    return [
        tag_combo
        for tag_combo in combinations(unique_tags, 2)
        if tag_combo[0].lower() == tag_combo[1].lower()
    ]


def display_case_sensitive_duplicate_tags():
    return pd.DataFrame(get_case_sensitive_duplicate_tags())
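# Summary string and frequency table backing the "Tag frequencies" tab; both can
# optionally fold tags to lower case first.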
def get_number_of_tags(case_sensitive=True):
    tags = set(get_all_tags())
    if case_sensitive:
        return f"Total number of unique tags (case sensitive): {len(tags)}"
    tags = {tag.lower() for tag in tags}
    return f"Total number of unique tags (case insensitive): {len(tags)}"


def tag_frequency(case_sensitive=True):
    tags = get_all_tags()
    if not case_sensitive:
        tags = (tag.lower() for tag in tags)
    tags_frequencies = dict(frequencies(tags))
    df = pd.DataFrame.from_dict(
        tags_frequencies, orient="index", columns=["Count"]
    ).sort_values(by="Count", ascending=False)
    return df.reset_index()
def tag_frequency_by_library(library_filter):
    df = filter_df_by_library(library_filter)
    tags = concat(df["tags"])
    tags = dict(frequencies(tags))
    df = pd.DataFrame.from_dict(tags, orient="index", columns=["Count"]).sort_values(
        by="Count", ascending=False
    )
    return df.reset_index()
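# Model-card coverage and length, broken down by library. `has_text` and
# `text_length` come from the source dataset and describe the model card body.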
def has_model_card_by_library(top_n):
    df = load_data()
    if top_n:
        top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
        df = df[df.library.isin(top_libs)]
    return (
        df.groupby("library")["has_text"]
        .mean()
        .sort_values()
        .plot.barh()
    )
def model_card_length_by_library(top_n):
    df = load_data()
    if top_n:
        top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
        df = df[df.library.isin(top_libs)]
    return df.groupby("library")["text_length"].describe().round().reset_index()
def metadata_coverage_by_library(metadata_field):
    df = load_data()
    return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
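# Compare metadata coverage for models pushed with an autogenerated model card
# against those without, then break coverage down by the tag naming the
# generating framework.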
def metadata_coverage_autogenerated_vs_not():
    df = load_data()
    return (
        df.groupby("is_generated")[[c for c in df.columns if c.startswith("has")]]
        .mean()
        .transpose()
        .round(6)
        .reset_index()
        .rename(
            columns={
                True: "From autogenerated",
                False: "Not autogenerated",
                "index": "Metadata/tag field",
            }
        )
    )
def metadata_coverage_by_autogenerated(metadata_field):
    df = load_data()
    subset_df = df[df["is_generated"]].copy(deep=True)
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")[metadata_field]
        .mean()
        .sort_values()
        .plot.barh()
    )
def model_card_length_by_autogenerated():
    df = load_data()
    subset_df = df[df["is_generated"]].copy(deep=True)
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")["text_length"]
        .describe()
        .round()
        .reset_index()
    )
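# Values computed once at startup: the page abstract, the library choices, and
# the list of "has_*" coverage columns offered in the dropdowns below.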
_ABSTRACT = """
tl;dr: this dashboard provides an overview of the metadata associated with models hosted on the Hugging Face Hub.
\n
Each tab of the dashboard focuses on a different aspect of model metadata, with a particular
focus on how metadata coverage varies across the libraries represented on the Hub.
"""

df = load_data()
top_n = df.library.value_counts().shape[0]
libraries = [library for library in df.library.dropna().unique() if library]
metadata_coverage_columns = [c for c in df.columns if c.startswith("has")]
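# UI: each tab wires a control's .change event to one of the functions above;
# queue=False runs these lightweight updates outside the request queue.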
with gr.Blocks() as demo:
    gr.Markdown("# 🤗 Hub Metadata Explorer")
    gr.Markdown(_ABSTRACT)
    with gr.Tab("Tag frequencies"):
        gr.Markdown(
            "Tags are one of the key ways in which users may identify models which are of interest. This tab provides "
            "some visualizations of tags across *all* models (regardless of library)."
        )
        with gr.Row():
            gr.Markdown(
                "The accordion below shows the top tags for models on the hub (optionally treating "
                "tags as case insensitive)."
            )
        with gr.Row():
            case_sensitive = gr.Checkbox(
                True,
                label="case sensitive",
            )
            mk = gr.Markdown()
            case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
            with gr.Accordion("Tag Frequencies", open=False):
                tag_freq_df = gr.Dataframe()
                case_sensitive.change(
                    tag_frequency, [case_sensitive], tag_freq_df, queue=False
                )
        with gr.Row():
            gr.Markdown(
                "Some tags are currently used in both cased and uncased forms, e.g. 'translation' vs 'Translation'."
            )
        with gr.Row():
            gr.Markdown(
                f"Number of tag pairs which differ only by case: {len(get_case_sensitive_duplicate_tags())}"
            )
        with gr.Row():
            with gr.Accordion("View case sensitive tag pairs", open=False):
                gr.Dataframe(display_case_sensitive_duplicate_tags())
| with gr.Tab("Tags frequencies by library"): | |
| gr.Markdown( | |
| "The π€ hub hosts models from a wide range of machine learning libraries. These libraries use tags in " | |
| "slightly different ways. The table below gives a breakdown of the most frequent tags for each library." | |
| ) | |
| library_choice = gr.Dropdown(choices=libraries, label="select library") | |
| df = gr.Dataframe() | |
| library_choice.change( | |
| tag_frequency_by_library, [library_choice], df, queue=False | |
| ) | |
| with gr.Tab("Metadata coverage by library"): | |
| gr.Markdown( | |
| "Libraries hosting models on the Hugging Face hub take different approaches to " | |
| "metadata i.e. some libraries automatically generate metadata for a model at the end of a " | |
| "training run. These libraries may also have different types of users who take differing " | |
| "approaches to creating metadata for models they share on the hub. The below chart allows you to " | |
| "see which libraries have better coverage for key areas of model metadata. " | |
| ) | |
| metadata_field = gr.Dropdown(choices=metadata_coverage_columns) | |
| plot = gr.Plot() | |
| metadata_field.change( | |
| metadata_coverage_by_library, [metadata_field], plot, queue=False | |
| ) | |
| with gr.Tab("Auto generated model cards"): | |
| gr.Markdown( | |
| "Some libraries/training frameworks automatically generate a model card when pushing models to " | |
| "the hub. The below dataframe compares the metadata coverage across several tags for models " | |
| "which are pushed with autogenerated model cards compared to those without. " | |
| "" | |
| "**Note** this " | |
| "breakdown relies on tags with `autogenerated` in them." | |
| "As a result some model cards might be in the wrong category. " | |
| ) | |
| gr.Dataframe(metatadata_coverage_autogenerated_vs_test()) | |
| with gr.Row(): | |
| metadata_field = gr.Dropdown(choices=metadata_coverage_columns) | |
| plot = gr.Plot() | |
| metadata_field.change( | |
| metadata_coverage_by_autogenerated, [metadata_field], plot, queue=False | |
| ) | |
| with gr.Tab("Model Cards"): | |
| gr.Markdown( | |
| """Model cards are a key component of metadata for a model. Model cards can include both | |
| information created by a human i.e. outlining the goals behind the creation of the model and information | |
| created by a training framework. This automatically generated information can contain information about | |
| number of epochs, learning rate, weight decay etc. """ | |
| ) | |
| min_lib_frequency = gr.Slider( | |
| minimum=1, maximum=top_n, value=10, label="filter by top n libraries" | |
| ) | |
| with gr.Column(): | |
| plot = gr.Plot() | |
| min_lib_frequency.change( | |
| has_model_card_by_library, [min_lib_frequency], plot, queue=False | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("Mean length of model card by library") | |
| df = gr.Dataframe() | |
| min_lib_frequency.change( | |
| model_card_length_by_library, [min_lib_frequency], df, queue=False | |
| ) | |
| demo.launch() | |