initial commit
app.py
ADDED
@@ -0,0 +1,6 @@
import evaluate
from evaluate.utils import launch_gradio_widget


module = evaluate.load("saridormi/b_norm")
launch_gradio_widget(module)
b_norm.py
ADDED
@@ -0,0 +1,63 @@
from typing import Dict, List

import datasets
import evaluate

from .reused import bleuFromMaps, splitPuncts

_CITATION = """\
@inproceedings{tao2021evaluation,
  title={On the Evaluation of Commit Message Generation Models: An Experimental Study},
  author={Tao, Wei and Wang, Yanlin and Shi, Ensheng and Du, Lun and Han, Shi and Zhang, Hongyu and Zhang, Dongmei and Zhang, Wenqiang},
  booktitle={2021 IEEE International Conference on Software Maintenance and Evolution (ICSME)},
  pages={126--136},
  year={2021},
  organization={IEEE}
}
@inproceedings{Papineni02bleu:a,
  author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
  title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
  booktitle = {},
  year = {2002},
  pages = {311--318}
}
@inproceedings{lin-och-2004-orange,
  title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
  author = "Lin, Chin-Yew and
      Och, Franz Josef",
  booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
  month = "aug 23{--}aug 27",
  year = "2004",
  address = "Geneva, Switzerland",
  publisher = "COLING",
  url = "https://www.aclweb.org/anthology/C04-1072",
  pages = "501--507",
}
"""

_DESCRIPTION = """\
B-Norm is a variation of BLEU. It uses smoothing by Lin and Och, 2004 and does some additional preprocessing steps.
It was recommended for evaluation of commit message generation approaches in the
"On the Evaluation of Commit Message Generation Models: An Experimental Study" paper accepted to ICSME 2021.
This class uses implementation provided in the replication package.
"""


class BLEUNorm(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/DeepSoftwareAnalytics/CommitMsgEmpirical/blob/main/metrics/B-Norm.py"],
        )

    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:  # type: ignore[override]
        prediction_map = {i: [splitPuncts(pred.strip().lower())] for i, pred in enumerate(predictions)}
        gold_map = {i: [splitPuncts(ref.strip().lower())] for i, ref in enumerate(references)}
        return {"b_norm": bleuFromMaps(gold_map, prediction_map)[0] / 100.0}
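For context, a minimal usage sketch of the metric defined above, assuming the module is loaded from the Hub as in app.py; the example inputs and the printed value are illustrative only:

import evaluate

# Load the community metric by its Hub id (same id app.py uses).
b_norm = evaluate.load("saridormi/b_norm")
result = b_norm.compute(
    predictions=["fix typo in readme", "add unit tests"],
    references=["fix typo in the readme", "add unit tests for parser"],
)
print(result)  # a dict like {"b_norm": <float between 0 and 1>}; the value depends on the inputs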
requirements.txt
ADDED
@@ -0,0 +1 @@
evaluate
reused.py
ADDED
@@ -0,0 +1,221 @@
"""
This script is copied from https://github.com/DeepSoftwareAnalytics/CommitMsgEmpirical,
the replication package for "On the Evaluation of Commit Message Generation Models: An Experimental Study"
accepted to ICSME 2021.
"""

#!/usr/bin/python

"""
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
"""

# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $

"""Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.
score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
"""

import math
import os
import re
import subprocess
import sys
import xml.sax.saxutils

# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0

preserve_case = False
eff_ref_len = "shortest"

normalize1 = [
    ("<skipped>", ""),  # strip "skipped" tags
    (r"-\n", ""),  # strip end-of-line hyphenation and join lines
    (r"\n", " "),  # join lines
    # (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

normalize2 = [
    (r"([\{-\~\[-\` -\&\(-\+\:-\@\/])", r" \1 "),  # tokenize punctuation. apostrophe is missing
    (r"([^0-9])([\.,])", r"\1 \2 "),  # tokenize period and comma unless preceded by a digit
    (r"([\.,])([^0-9])", r" \1 \2"),  # tokenize period and comma unless followed by a digit
    (r"([0-9])(-)", r"\1 \2 "),  # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]


def normalize(s):
    """Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl."""
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
    if nonorm:
        return s.split()
    if type(s) is not str:
        s = " ".join(s)
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
    s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not preserve_case:
        s = s.lower()  # this might not be identical to the original
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()


def count_ngrams(words, n=4):
    counts = {}
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i : i + k])
            counts[ngram] = counts.get(ngram, 0) + 1
    return counts


def cook_refs(refs, n=4):
    """Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them."""

    refs = [normalize(ref) for ref in refs]
    maxcounts = {}
    for ref in refs:
        counts = count_ngrams(ref, n)
        for (ngram, count) in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
    return ([len(ref) for ref in refs], maxcounts)


def cook_test(test, item, n=4):
    """Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it."""
    (reflens, refmaxcounts) = item
    test = normalize(test)
    result = {}
    result["testlen"] = len(test)

    # Calculate effective reference sentence length.

    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
        result["reflen"] = float(sum(reflens)) / len(reflens)
    elif eff_ref_len == "closest":
        min_diff = None
        for reflen in reflens:
            if min_diff is None or abs(reflen - len(test)) < min_diff:
                min_diff = abs(reflen - len(test))
                result["reflen"] = reflen

    result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)]

    result["correct"] = [0] * n
    counts = count_ngrams(test, n)
    for (ngram, count) in counts.items():
        result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)

    return result


def score_cooked(allcomps, n=4, ground=0, smooth=1):
    totalcomps = {"testlen": 0, "reflen": 0, "guess": [0] * n, "correct": [0] * n}
    for comps in allcomps:
        for key in ["testlen", "reflen"]:
            totalcomps[key] += comps[key]
        for key in ["guess", "correct"]:
            for k in range(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    all_bleus = []
    for k in range(n):
        correct = totalcomps["correct"][k]
        guess = totalcomps["guess"][k]
        addsmooth = 0
        if smooth == 1 and k > 0:
            addsmooth = 1
        logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(guess + addsmooth + sys.float_info.min)
        if guess == 0:
            all_bleus.append(-10000000)
        else:
            all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))

    logbleu /= float(n)
    all_bleus.insert(0, logbleu)

    brevPenalty = min(0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1))
    for i in range(len(all_bleus)):
        if i == 0:
            all_bleus[i] += brevPenalty
        all_bleus[i] = math.exp(all_bleus[i])
    return all_bleus


def bleu(refs, candidate, ground=0, smooth=1):
    refs = cook_refs(refs)
    test = cook_test(candidate, refs)
    return score_cooked([test], ground=ground, smooth=smooth)


def splitPuncts(line):
    return " ".join(re.findall(r"[\w]+|[^\s\w]", line))


def computeMaps(predictions, goldfile):
    predictionMap = {}
    goldMap = {}
    gf = open(goldfile, "r")

    for row in predictions:
        cols = row.strip().split("\t")
        if len(cols) == 1:
            (rid, pred) = (cols[0], "")
        else:
            (rid, pred) = (cols[0], cols[1])
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]

    for row in gf:
        (rid, pred) = row.split("\t")
        if rid in predictionMap:  # Only insert if the id exists for the method
            if rid not in goldMap:
                goldMap[rid] = []
            goldMap[rid].append(splitPuncts(pred.strip().lower()))

    return (goldMap, predictionMap)


# m1 is the reference map
# m2 is the prediction map
def bleuFromMaps(m1, m2):
    score = [0] * 5
    num = 0.0

    for key in m1:
        if key in m2:
            bl = bleu(m1[key], m2[key][0])
            score = [score[i] + bl[i] for i in range(0, len(bl))]
            num += 1
    return [s * 100.0 / num for s in score]


if __name__ == "__main__":
    ref_sentence_lst = open(sys.argv[1]).read().split("\n")
    with open("tmp_ref.txt", "w") as f:
        for idx, ref_sentence in enumerate(ref_sentence_lst):
            f.write("{}\t{}\n".format(idx, ref_sentence))

    reference_file = "tmp_ref.txt"
    predictions = []
    for idx, row in enumerate(sys.stdin):
        predictions.append("{}\t{}".format(idx, row))
    if len(predictions) == len(ref_sentence_lst) - 1:
        predictions.append("{}\t{}".format(len(predictions), ""))
    (goldMap, predictionMap) = computeMaps(predictions, reference_file)
    print(bleuFromMaps(goldMap, predictionMap)[0])
    os.remove("tmp_ref.txt")
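For reference, a minimal sketch of how b_norm.py drives the helpers above (splitPuncts and bleuFromMaps); the inputs are made up, and the plain "from reused import ..." assumes the script sits on the import path (b_norm.py itself uses a relative import):

from reused import bleuFromMaps, splitPuncts

predictions = ["fix typo in readme"]
references = ["fix typo in the readme"]

# Same preprocessing as BLEUNorm._compute: strip, lowercase, split off punctuation.
prediction_map = {i: [splitPuncts(p.strip().lower())] for i, p in enumerate(predictions)}
gold_map = {i: [splitPuncts(r.strip().lower())] for i, r in enumerate(references)}

# bleuFromMaps returns [combined BLEU, 1-gram, 2-gram, 3-gram, 4-gram] on a 0-100 scale;
# b_norm.py keeps only the first entry and rescales it to 0-1.
print(bleuFromMaps(gold_map, prediction_map)[0] / 100.0)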