import gradio as gr
import json
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import huggingface_hub

import prep_decompiled
| description = """# ReSym Test Space | |
| This is a test space of the models from the [ReSym | |
| artifacts](https://github.com/lt-asset/resym). Sadly, at the time I am writing | |
| this, not all of ReSym is publicly available; specifically, the Prolog component | |
| is [not available](https://github.com/lt-asset/resym/issues/2). | |
| This space simply performs inference on the two pretrained models available as | |
| part of the ReSym artifacts. It takes a variable name and some decompiled code | |
| as input, and outputs the variable type and other information. | |
| ## Todo | |
| * Add support for FieldDecoder model | |
| """ | |

hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

# The released VarDecoder checkpoint is used together with the
# StarCoderBase-3B tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)
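
# Hedged sketch for the "Add support for FieldDecoder model" Todo item: the
# second pretrained ReSym model could presumably be loaded the same way. The
# repo id "ejschwartz/resym-fielddecoder" is an assumption, not a confirmed
# model path, so this stays commented out.
# fielddecoder_model = AutoModelForCausalLM.from_pretrained(
#     "ejschwartz/resym-fielddecoder", torch_dtype=torch.bfloat16, device_map="auto"
# )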
| example = """{ | |
| "input": "What are the original name and data type of variables `a1`, `a2`, `k`, `j`, `i`?\n```\n_BYTE *__fastcall sub_4022CD(_BYTE *a1, __int64 a2)\n{\n_BYTE *result; // rax\n__int16 v4; // [rsp+1Ch] [rbp-14h]\nunsigned __int16 v5; // [rsp+1Eh] [rbp-12h]\nunsigned __int16 v6; // [rsp+20h] [rbp-10h]\nunsigned __int16 v7; // [rsp+22h] [rbp-Eh]\nunsigned int k; // [rsp+24h] [rbp-Ch]\nunsigned int j; // [rsp+28h] [rbp-8h]\nunsigned int i; // [rsp+2Ch] [rbp-4h]\n\nfor ( i = 0; i <= 2; ++i )\n{\nfor ( j = 0; j <= 0x3F; ++j )\n{\nfor ( k = 0; k <= 3; ++k )\n{\n*(&v4 + k) = *(_WORD *)(a2 + 2 * (k + 4 * j + ((unsigned __int64)i << 8)));\n*(&v4 + k) += (*(&v4 + k) >> 15) & 0xD01;\n*(&v4 + k) = ((((unsigned __int16)*(&v4 + k) << 10) + 1664) / 0xD01u) & 0x3FF;\n}\n*a1 = v4;\na1[1] = (4 * v5) | HIBYTE(v4);\na1[2] = (16 * v6) | (v5 >> 6);\na1[3] = ((_BYTE)v7 << 6) | (v6 >> 4);\nresult = a1 + 4;\na1[4] = v7 >> 2;\na1 += 5;\n}\n}\nreturn result;\n}\n```", | |
| "output": "a1: r, uint8_t*\na2: a, const polyvec*\nk: t, uint16_t\nj: -, -\ni: k, unsigned int", | |
| "funname": "pqcrystals_kyber768_ref_polyvec_compress", | |
| "bin": "6ea440a6c772bc0d6a6089c9ff33ae31da13daf3b72acbe175674b0bb21987ed", | |
| "proj": "pq-crystals/kyber", | |
| "cluster_var": { | |
| "array": [ | |
| [ | |
| "k", | |
| "j" | |
| ] | |
| ] | |
| } | |
| }""" | |


# Request GPU access for this call when running on ZeroGPU hardware (the
# decorator has no effect elsewhere).
@spaces.GPU
def infer(var_name, code):
    splitcode = code.splitlines()
    comments = prep_decompiled.extract_comments(splitcode)
    sig = prep_decompiled.parse_signature(splitcode)
    print(f"comments={comments} sig={sig}")

    # line = json.loads(input)
    # first_token = line["output"].split(":")[0]

    # Prompt the model with the decompiled code and ask it to complete the
    # "<var>:" line for the requested variable.
    prompt = code + var_name + ":"

    # Truncate the encoded prompt so that it plus up to 1024 generated tokens
    # fits in an 8192-token context.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()[:, : 8192 - 1024]
    output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    # Decode only the newly generated tokens, not the echoed prompt.
    output = tokenizer.decode(
        output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    output = var_name + ":" + output
    return output
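
# Example of invoking the handler directly (hypothetical local usage; requires
# a GPU and the model loaded above). For the bundled example, the reference
# output recorded in the ReSym artifacts begins with "a1: r, uint8_t*":
#
#   print(infer("a1", json.loads(example)["input"]))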

demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Text(label="Variable Name", value="a1"),
        gr.Textbox(label="Decompiled Code", lines=10, value=json.loads(example)["input"]),
    ],
    outputs=gr.Text(label="VarDecoder Output"),
    description=description,
)

demo.launch()