Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | 
         @@ -20,7 +20,7 @@ library_name: safe-rlhf 
     | 
|
| 20 | 
         | 
| 21 | 
         
             
            ## Model Details
         
     | 
| 22 | 
         | 
| 23 | 
         
            -
            The Beaver  
     | 
| 24 | 
         
             
            It can play a role in the Safe RLHF algorithm, helping the Beaver model become safer and less harmful.
         
     | 
| 25 | 
         | 
| 26 | 
         
             
            - **Developed by:** the [PKU-Alignment](https://github.com/PKU-Alignment) Team.
         
     | 
| 
         @@ -36,16 +36,17 @@ It can play a role in the safe RLHF algorithm, helping the Beaver model become m 
     | 
|
| 36 | 
         
             
            - **Reward Model:** <https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward>
         
     | 
| 37 | 
         
             
            - **Cost Model:** <https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-cost>
         
     | 
| 38 | 
         
             
            - **Dataset Paper:** <https://arxiv.org/abs/2307.04657>
         
     | 
| 39 | 
         
            -
            - **Paper:**  
     | 
| 40 | 
         | 
| 41 | 
         
             
            ## How to Use the Cost Model
         
     | 
| 42 | 
         | 
| 43 | 
         
             
            ```python
         
     | 
| 
         | 
|
| 44 | 
         
             
            from transformers import AutoTokenizer
         
     | 
| 45 | 
         
             
            from safe_rlhf.models import AutoModelForScore
         
     | 
| 46 | 
         | 
| 47 | 
         
            -
            model = AutoModelForScore.from_pretrained('PKU-Alignment/beaver-7b-v1.0-cost', device_map='auto')
         
     | 
| 48 | 
         
            -
            tokenizer = AutoTokenizer.from_pretrained('PKU-Alignment/beaver-7b-v1.0-cost' 
     | 
| 49 | 
         | 
| 50 | 
         
             
            input = 'BEGINNING OF CONVERSATION: USER: hello ASSISTANT:Hello! How can I help you today?'
         
     | 
| 51 | 
         | 
| 
         @@ -54,34 +55,45 @@ output = model(**input_ids) 
     | 
|
| 54 | 
         
             
            print(output)
         
     | 
| 55 | 
         | 
| 56 | 
         
             
            # ScoreModelOutput(
         
     | 
| 57 | 
         
            -
            #     scores=tensor([[[- 
     | 
| 58 | 
         
            -
            # 
     | 
| 59 | 
         
            -
            # 
     | 
| 60 | 
         
            -
            # 
     | 
| 61 | 
         
            -
            # 
     | 
| 62 | 
         
            -
            # 
     | 
| 63 | 
         
            -
            # 
     | 
| 64 | 
         
            -
            # 
     | 
| 65 | 
         
            -
            # 
     | 
| 66 | 
         
            -
            # 
     | 
| 67 | 
         
            -
            # 
     | 
| 68 | 
         
            -
            # 
     | 
| 69 | 
         
            -
            # 
     | 
| 70 | 
         
            -
            # 
     | 
| 71 | 
         
            -
            # 
     | 
| 72 | 
         
            -
            # 
     | 
| 73 | 
         
            -
            # 
     | 
| 74 | 
         
            -
            # 
     | 
| 75 | 
         
            -
            # 
     | 
| 76 | 
         
            -
            # 
     | 
| 77 | 
         
            -
            # 
     | 
| 78 | 
         
            -
            # 
     | 
| 79 | 
         
            -
            # 
     | 
| 80 | 
         
            -
            # 
     | 
| 81 | 
         
            -
            # 
     | 
| 82 | 
         
            -
            # 
     | 
| 83 | 
         
            -
            # 
     | 
| 84 | 
         
            -
            # 
     | 
| 85 | 
         
            -
            #     end_scores=tensor([[- 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 86 | 
         
             
            # )
         
     | 
| 87 | 
         
            -
            ```
         
     | 
| 
         | 
|
| 20 | 
         | 
| 21 | 
         
             
            ## Model Details
         
     | 
| 22 | 
         | 
| 23 | 
         
            +
            The Beaver cost model is a preference model trained using the [PKU-SafeRLHF](https://huggingface.co/datasets/PKU-Alignment/PKU-SafeRLHF) dataset.
         
     | 
| 24 | 
         
             
            It can play a role in the Safe RLHF algorithm, helping the Beaver model become safer and less harmful.
         
     | 
| 25 | 
         | 
| 26 | 
         
             
            - **Developed by:** the [PKU-Alignment](https://github.com/PKU-Alignment) Team.
         
     | 
| 
         | 
|
| 36 | 
         
             
            - **Reward Model:** <https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward>
         
     | 
| 37 | 
         
             
            - **Cost Model:** <https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-cost>
         
     | 
| 38 | 
         
             
            - **Dataset Paper:** <https://arxiv.org/abs/2307.04657>
         
     | 
| 39 | 
         
            +
            - **Paper:** <https://arxiv.org/abs/2310.12773>
         
     | 
| 40 | 
         | 
| 41 | 
         
             
            ## How to Use the Cost Model
         
     | 
| 42 | 
         | 
| 43 | 
         
             
            ```python
         
     | 
| 44 | 
         
            +
            import torch
         
     | 
| 45 | 
         
             
            from transformers import AutoTokenizer
         
     | 
| 46 | 
         
             
            from safe_rlhf.models import AutoModelForScore
         
     | 
| 47 | 
         | 
| 48 | 
         
            +
            model = AutoModelForScore.from_pretrained('PKU-Alignment/beaver-7b-v1.0-cost', torch_dtype=torch.bfloat16, device_map='auto')
         
     | 
| 49 | 
         
            +
            tokenizer = AutoTokenizer.from_pretrained('PKU-Alignment/beaver-7b-v1.0-cost')
         
     | 
| 50 | 
         | 
| 51 | 
         
             
            input = 'BEGINNING OF CONVERSATION: USER: hello ASSISTANT:Hello! How can I help you today?'
         
     | 
| 52 | 
         | 
| 
         | 
|
| 55 | 
         
             
            print(output)
         
     | 
| 56 | 
         | 
| 57 | 
         
             
            # ScoreModelOutput(
         
     | 
| 58 | 
         
            +
            #     scores=tensor([[[ -9.4375],
         
     | 
| 59 | 
         
            +
            #          [ -2.5156],
         
     | 
| 60 | 
         
            +
            #          [ -2.6562],
         
     | 
| 61 | 
         
            +
            #          [ -2.3594],
         
     | 
| 62 | 
         
            +
            #          [ -1.9375],
         
     | 
| 63 | 
         
            +
            #          [ -2.5781],
         
     | 
| 64 | 
         
            +
            #          [ -1.4766],
         
     | 
| 65 | 
         
            +
            #          [ -1.9922],
         
     | 
| 66 | 
         
            +
            #          [ -2.6562],
         
     | 
| 67 | 
         
            +
            #          [ -3.8125],
         
     | 
| 68 | 
         
            +
            #          [ -2.9844],
         
     | 
| 69 | 
         
            +
            #          [ -4.1875],
         
     | 
| 70 | 
         
            +
            #          [ -3.5938],
         
     | 
| 71 | 
         
            +
            #          [ -4.6562],
         
     | 
| 72 | 
         
            +
            #          [ -4.0000],
         
     | 
| 73 | 
         
            +
            #          [ -3.3438],
         
     | 
| 74 | 
         
            +
            #          [ -4.5625],
         
     | 
| 75 | 
         
            +
            #          [ -4.8438],
         
     | 
| 76 | 
         
            +
            #          [ -5.1875],
         
     | 
| 77 | 
         
            +
            #          [ -8.0000],
         
     | 
| 78 | 
         
            +
            #          [ -8.4375],
         
     | 
| 79 | 
         
            +
            #          [-10.5000],
         
     | 
| 80 | 
         
            +
            #          [-10.5000],
         
     | 
| 81 | 
         
            +
            #          [ -8.8750],
         
     | 
| 82 | 
         
            +
            #          [-10.1250],
         
     | 
| 83 | 
         
            +
            #          [-10.2500],
         
     | 
| 84 | 
         
            +
            #          [-11.5625],
         
     | 
| 85 | 
         
            +
            #          [-10.7500]]], grad_fn=<ToCopyBackward0>),
         
     | 
| 86 | 
         
            +
            #     end_scores=tensor([[-10.7500]], grad_fn=<ToCopyBackward0>),
         
     | 
| 87 | 
         
            +
            #     last_hidden_state=tensor([[[ 2.2812, -0.4219, -0.2832,  ...,  0.2715,  0.4277,  1.1875],
         
     | 
| 88 | 
         
            +
            #          [-0.3730, -0.2158,  1.2891,  ..., -1.3281,  0.6016,  0.7773],
         
     | 
| 89 | 
         
            +
            #          [ 0.2285, -1.2422,  1.0625,  ..., -1.3438,  1.1875,  1.1016],
         
     | 
| 90 | 
         
            +
            #          ...,
         
     | 
| 91 | 
         
            +
            #          [-0.8828, -2.6250,  0.9180,  ..., -0.2773,  1.7500,  0.7695],
         
     | 
| 92 | 
         
            +
            #          [ 2.0781, -4.1250, -0.1069,  ..., -0.8008,  0.4844,  0.4102],
         
     | 
| 93 | 
         
            +
            #          [ 2.9688, -1.6250,  1.1250,  ...,  0.3223,  0.0439, -2.3281]]],
         
     | 
| 94 | 
         
            +
            #        dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>),
         
     | 
| 95 | 
         
            +
            #     end_last_hidden_state=tensor([[ 2.9688, -1.6250,  1.1250,  ...,  0.3223,  0.0439, -2.3281]],
         
     | 
| 96 | 
         
            +
            #        dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>),
         
     | 
| 97 | 
         
            +
            #     end_index=tensor([27])
         
     | 
| 98 | 
         
             
            # )
         
     | 
| 99 | 
         
            +
            ```
         
     |