Commit 18d0924
Parent(s): c4b0d7f

Update README.md and config.json, and add aragpt2-large model to ARAGPT2_PRETRAINED_MODEL_ARCHIVE_LIST

Changed files:
- README.md (+16 -13)
- config.json (+2 -1)
- modeling_aragpt2.py (+1 -0)
README.md CHANGED

@@ -6,9 +6,9 @@ license_link: https://github.com/aub-mind/arabert/blob/master/aragpt2/LICENSE
 datasets:
 - wikipedia
 - Osian
--
-- oscar
-- Assafir
+- arabic-billion-words
+- oscar
+- Assafir-private
 inference: false
 widget:
 - text: "يحكى أن مزارعا مخادعا قام ببيع بئر الماء الموجود في أرضه لجاره مقابل مبلغ كبير من المال"

@@ -18,6 +18,8 @@ widget:

 # Arabic GPT2

+
+
 <img src="https://raw.githubusercontent.com/aub-mind/arabert/master/AraGPT2.png" width="100" align="left"/>

 You can find more information in our paper [AraGPT2](https://arxiv.org/abs/2012.15520)

@@ -32,18 +34,17 @@ Both models are trained using the `adafactor` optimizer, since the `adam` and `lamb` optimizers use too much memory

 AraGPT2 is trained on the same large Arabic Dataset as AraBERTv2.

-
+
+# NOTE: The model expects the input to be preprocessed using the `arabert` library;
+otherwise the model won't be able to generate the correct output.

 ## Testing the model using `transformers`:

-
+The model code is now hosted on HuggingFace, so you need to use the `trust_remote_code` flag; the model can then be used as follows:
+

 ```python
-from transformers import GPT2TokenizerFast, pipeline
-#for base and medium
-from transformers import GPT2LMHeadModel
-#for large and mega
-from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel
+from transformers import AutoModelForCausalLM, GPT2TokenizerFast, pipeline

 from arabert.preprocess import ArabertPreprocessor


@@ -53,13 +54,15 @@ arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)
 text=""
 text_clean = arabert_prep.preprocess(text)

-model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
 tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
-generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+generation_pipeline = pipeline(
+    "text-generation", model=MODEL_NAME, trust_remote_code=True
+)

 #feel free to try different decoding settings
 generation_pipeline(text,
-    pad_token_id=tokenizer.eos_token_id,
+    pad_token_id=generation_pipeline.tokenizer.eos_token_id,
     num_beams=10,
     max_length=200,
     top_p=0.9,
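Pieced together, the updated README snippet amounts to the following end-to-end script. This is a minimal sketch, assuming the `arabert` package is installed and the Hub is reachable; the prompt reuses the widget example from the model card, and the decoding settings are the ones shown in the diff.

```python
# Minimal sketch of the updated usage, assuming `arabert` and `transformers`
# are installed and the Hub is reachable.
from transformers import pipeline

from arabert.preprocess import ArabertPreprocessor

MODEL_NAME = "aubmindlab/aragpt2-large"

# Preprocess the prompt with arabert, as the NOTE in the README requires.
arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)
text = "يحكى أن مزارعا مخادعا قام ببيع بئر الماء الموجود في أرضه لجاره مقابل مبلغ كبير من المال"
text_clean = arabert_prep.preprocess(text)

# trust_remote_code is required because the model code now lives in the repo.
generation_pipeline = pipeline(
    "text-generation", model=MODEL_NAME, trust_remote_code=True
)

# Same decoding settings as in the diff; feel free to vary them.
output = generation_pipeline(
    text_clean,
    pad_token_id=generation_pipeline.tokenizer.eos_token_id,
    num_beams=10,
    max_length=200,
    top_p=0.9,
)
print(output[0]["generated_text"])
```

Loading everything through `pipeline(model=MODEL_NAME, ...)` makes the separate `AutoModelForCausalLM.from_pretrained` call in the card redundant; either path works, since the pipeline downloads the model and tokenizer itself.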
config.json CHANGED

@@ -41,5 +41,6 @@
       "no_repeat_ngram_size": 3
     }
   },
-  "vocab_size": 64000
+  "vocab_size": 64000,
+  "tokenizer_class": "GPT2Tokenizer"
 }
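The new `tokenizer_class` entry is what lets `AutoTokenizer` resolve a tokenizer for this custom architecture without extra arguments. A minimal sketch of the effect, assuming Hub access:

```python
# Minimal sketch: AutoTokenizer reads "tokenizer_class" from config.json
# when the architecture itself is custom, so it resolves to a GPT-2
# tokenizer (the fast variant by default) without a manual class choice.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-large")
print(type(tokenizer).__name__)  # e.g. GPT2TokenizerFast
```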
modeling_aragpt2.py CHANGED

@@ -46,6 +46,7 @@ _CONFIG_FOR_DOC = "AraGPT2Config"
 _TOKENIZER_FOR_DOC = "GPT2Tokenizer"

 ARAGPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "aubmindlab/aragpt2-large",
     "aubmindlab/aragpt2-mega",
     # See all AraGPT2 models at https://huggingface.co/models?filter=aragpt2
 ]
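`ARAGPT2_PRETRAINED_MODEL_ARCHIVE_LIST` is a module-level registry of reference checkpoints, so this change records that the repo's code also serves the `aragpt2-large` checkpoint. A small sanity-check sketch (needs network access; `AutoConfig` is the standard `transformers` entry point, not part of this file):

```python
# Sketch: verify that each registered checkpoint resolves on the Hub.
from transformers import AutoConfig

for name in ["aubmindlab/aragpt2-large", "aubmindlab/aragpt2-mega"]:
    config = AutoConfig.from_pretrained(name, trust_remote_code=True)
    print(name, "->", config.model_type)
```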