import numpy as np
import os
from nltk import PorterStemmer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import spearmanr, pearsonr, kendalltau
from pytorch_transformers import *
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import argparse

from scorer.data_helper.json_reader import read_sorted_scores, read_articles, read_processed_scores, read_scores
from helpers.data_helpers import sent2stokens_wostop, sent2tokens_wostop, sent2stokens, text_normalization
from scorer.auto_metrics.metrics import bleu, meteor
from resources import RUNS_DIR, ROUGE_DIR, BASE_DIR, MODEL_WEIGHT_DIR
from scorer.auto_metrics.rouge.rouge import RougeScorer
from step1_encode_doc_summ import raw_bert_encoder
from rewarder import Rewarder

# Single source of truth for the metric names accepted by evaluate_metric()
# and by the command line (previously duplicated in both places).
SUPPORTED_METRICS = [
    'ROUGE-1-F', 'ROUGE-1-R', 'ROUGE-2-F', 'ROUGE-2-R', 'ROUGE-L-F', 'ROUGE-L-R', 'ROUGE-SU*-F',
    'ROUGE-SU*-R', 'bleu-1', 'bleu-2', 'bleu-3', 'bleu-4', 'bleu-5', 'meteor',
    'infersent', 'bert-raw', 'bert-sts', 'bert-nli', 'bert-human', 'mover-1', 'mover-2', 'mover-smd',
]


def sts_bert_encoder(model, sent_list):
    """Encode sentences with a sentence-embedding model.

    Args:
        model: object exposing ``encode(list_of_sentences)``.
        sent_list: either a list of sentences, or a single string that is
            split into sentences with ``sent_tokenize`` first.

    Returns:
        Whatever ``model.encode`` returns for the sentence list.
    """
    if not isinstance(sent_list, list):
        assert isinstance(sent_list, str)
        sent_list = sent_tokenize(sent_list)
    vecs = model.encode(sent_list)
    return vecs


def sts_bert_rewarder(model, text1, text2):
    """Return the cosine similarity of the mean sentence vectors of two texts.

    Each text is encoded with ``sts_bert_encoder``; the per-sentence vectors
    are averaged along axis 0 before the similarity is computed.
    """
    vec_list1 = sts_bert_encoder(model, text1)
    vec_list2 = sts_bert_encoder(model, text2)
    avg_vec1 = np.mean(vec_list1, axis=0)
    avg_vec2 = np.mean(vec_list2, axis=0)
    return cosine_similarity(avg_vec1.reshape(1, -1), avg_vec2.reshape(1, -1))[0][0]


def raw_bert_rewarder(model, tokenizer, text1, text2):
    """Return the cosine similarity of two texts encoded by ``raw_bert_encoder``."""
    v1 = raw_bert_encoder(model, tokenizer, [text1])
    v2 = raw_bert_encoder(model, tokenizer, [text2])
    return cosine_similarity(v1.reshape(1, -1), v2.reshape(1, -1))[0][0]


def _preprocess_text(text, stem, remove_stop, stemmer, stopwords_list):
    """Apply the requested stemming / stop-word removal to one text.

    Returns ``text`` unchanged when neither option is requested.
    """
    if stem and remove_stop:
        return " ".join(sent2stokens_wostop(text, stemmer, stopwords_list, 'english', True))
    if remove_stop:
        return " ".join(sent2tokens_wostop(text, stopwords_list, 'english', True))
    if stem:
        return " ".join(sent2stokens(text, stemmer, 'english', True))
    return text


def _load_scorers(metric):
    """Instantiate the heavyweight model(s) a metric needs, keyed by role.

    Metrics that need no model (ROUGE, BLEU, METEOR) yield an empty dict.
    """
    scorers = {}
    if metric.startswith('infersent'):
        from scorer.auto_metrics.infersent_metric import InferSentScorer
        scorers['infersent'] = InferSentScorer()
    elif metric.startswith('sent2vec'):
        from scorer.auto_metrics.sent2vec_metric import Sent2Vec
        scorers['sent2vec'] = Sent2Vec()
    elif metric.startswith('bert'):
        if 'human' in metric:
            scorers['rewarder'] = Rewarder(os.path.join(MODEL_WEIGHT_DIR, 'sample.model'))
        elif 'sts' in metric:
            scorers['bert_model'] = SentenceTransformer('bert-large-nli-stsb-mean-tokens')
        elif 'nli' in metric:
            scorers['bert_model'] = SentenceTransformer('bert-large-nli-mean-tokens')
        else:
            # raw BERT
            scorers['bert_tokenizer'] = BertTokenizer.from_pretrained('bert-large-uncased')
            scorers['bert_model'] = BertModel.from_pretrained('bert-large-uncased')
    elif metric.startswith('mover'):
        print('Make sure that your have started the mover server. Find details at')
        from summ_eval.client import EvalClient
        scorers['mover'] = EvalClient()
    return scorers


def _score_summaries(metric, scorers, sys_summs, ref_summ, article, with_ref):
    """Score every system summary of one article with ``metric``.

    The comparison target is the reference summary when ``with_ref`` is true
    and the source article otherwise. Returns one score per summary, in the
    order of ``sys_summs``.
    """
    target = ref_summ if with_ref else article

    if 'rouge' in metric.lower():
        scores = []
        for ss in sys_summs:
            # A fresh scorer per summary, as in the original code; the
            # constructor's side effects are not visible from this file.
            rouge_scorer = RougeScorer(ROUGE_DIR, BASE_DIR)
            scores.append(rouge_scorer(ss, target)[metric])
        return scores
    if metric.startswith('bleu'):
        n = int(metric.split('-')[1])
        return [bleu(ss, [target], n, smooth=False) for ss in sys_summs]
    if metric.startswith('meteor'):
        return [meteor(ss, [target]) for ss in sys_summs]
    if metric.startswith('infersent'):
        infers = scorers['infersent']
        return [infers(ss, target) for ss in sys_summs]
    if metric.startswith('sent2vec'):
        s2v = scorers['sent2vec']
        return [s2v.score(ss, target) for ss in sys_summs]
    if metric.startswith('bert'):
        if 'human' in metric:
            rewarder = scorers['rewarder']
            # Note the argument order: the target comes first for the rewarder.
            return [rewarder(target, ss) for ss in sys_summs]
        if 'sts' in metric or 'nli' in metric:
            return [sts_bert_rewarder(scorers['bert_model'], ss, target) for ss in sys_summs]
        # raw BERT encoder
        return [raw_bert_rewarder(scorers['bert_model'], scorers['bert_tokenizer'], ss, target)
                for ss in sys_summs]
    if metric.startswith('mover'):
        if '1' in metric:
            mm = 'wmd_1'
        elif '2' in metric:
            mm = 'wmd_2'
        else:
            mm = 'smd'
        # With a reference the whole summary is one unit; without one the
        # article is split into sentences.
        if with_ref:
            cases = [[[ss], [ref_summ], mm] for ss in sys_summs]
        else:
            cases = [[[ss], sent_tokenize(article), mm] for ss in sys_summs]
        return scorers['mover'].eval(cases)['0']
    raise ValueError('unsupported metric: {}'.format(metric))


def evaluate_metric(metric, stem, remove_stop, with_ref, prompt='overall'):
    """Correlate an automatic metric with human ratings, article by article.

    For every article with at least two rated system summaries, the metric is
    computed for each summary (against the reference summary if ``with_ref``
    is true, otherwise against the source article) and Spearman, Pearson and
    Kendall correlations with the human scores are recorded. Per-summary
    scores are written to a CSV file under ``outputs``.

    Args:
        metric: one of ``SUPPORTED_METRICS``.
        stem: whether to stem the texts before scoring.
        remove_stop: whether to remove stop words before scoring.
        with_ref: compare against the reference summary (true) or the
            article (false).
        prompt: key of the human rating used as ground truth.

    Returns:
        Path of the CSV file that was written.
    """
    assert metric in SUPPORTED_METRICS
    stemmed_str = "_stem" if stem else ""
    stop_str = "_removestop" if remove_stop else ""
    if with_ref:
        ranks_file_path = os.path.join('outputs', 'wref_{}{}{}_{}_rank_correlation.csv'.format(metric, stemmed_str, stop_str, prompt))
    else:
        ranks_file_path = os.path.join('outputs', 'woref_{}{}{}_{}_rank_correlation.csv'.format(metric, stemmed_str, stop_str, prompt))
    print('\n====={}=====\n'.format(ranks_file_path))

    # The context manager guarantees the CSV is closed even if a scorer raises.
    with open(ranks_file_path, 'w') as ranks_file:
        ranks_file.write('article,summ_id,human_score,metric_score\n')

        sorted_scores = read_sorted_scores()
        input_articles, _ = read_articles()
        # Pre-fill with NaN so that articles skipped below are ignored by
        # np.nanmean instead of being averaged in as zero correlations.
        corr_data = np.full((len(sorted_scores), 3), np.nan)

        stopwords_list = set(stopwords.words("english"))
        stemmer = PorterStemmer()

        scorers = _load_scorers(metric)

        for i, (article_id, scores_list) in enumerate(tqdm(sorted_scores.items())):
            human_ranks = [s['scores'][prompt] for s in scores_list]
            if len(human_ranks) < 2:
                # Correlation is undefined for fewer than two points.
                continue
            ref_summ = scores_list[0]['ref']
            article = [entry['article'] for entry in input_articles if entry['id'] == article_id][0]

            summ_ids = [s['summ_id'] for s in scores_list]
            sys_summs = [
                text_normalization(_preprocess_text(s['sys_summ'], stem, remove_stop, stemmer, stopwords_list))
                for s in scores_list
            ]
            ref_summ = text_normalization(_preprocess_text(ref_summ, stem, remove_stop, stemmer, stopwords_list))
            article = text_normalization(_preprocess_text(article, stem, remove_stop, stemmer, stopwords_list))

            auto_metric_ranks = _score_summaries(metric, scorers, sys_summs, ref_summ, article, with_ref)

            for sid, amr, hr in zip(summ_ids, auto_metric_ranks, human_ranks):
                ranks_file.write('{},{},{:.2f},{:.4f}\n'.format(article_id, sid, hr, amr))

            spearmanr_result = spearmanr(human_ranks, auto_metric_ranks)
            print(spearmanr_result[0])
            pearsonr_result = pearsonr(human_ranks, auto_metric_ranks)
            kendalltau_result = kendalltau(human_ranks, auto_metric_ranks)
            corr_data[i, :] = [spearmanr_result[0], pearsonr_result[0], kendalltau_result[0]]

    corr_mean_all = np.nanmean(corr_data, axis=0)
    print('\n====={}=====\n'.format(ranks_file_path))
    print("Correlation mean on all data spearman/pearsonr/kendall: {}".format(corr_mean_all))

    return ranks_file_path


def parse_args():
    """Parse the command line.

    Returns:
        Tuple ``(metric, prompt, with_ref, stem, remove_stop)``. ``stem`` and
        ``remove_stop`` have no default and are ``None`` when the flag is
        omitted; ``with_ref`` defaults to 0.
    """
    ap = argparse.ArgumentParser("arguments for summary sampler")
    ap.add_argument('-m', '--metric', type=str, default='mover-1', choices=SUPPORTED_METRICS,
                    help='compare which metric against the human judgements')
    ap.add_argument('-p', '--prompt', type=str, default='overall',
                    help='which human ratings you want to use as ground truth', choices=['overall', 'grammar'])
    ap.add_argument('-r', '--with_ref', type=int, default=0,
                    help='whether to use references in your metric; 1: yes, 0: no')
    ap.add_argument('-s', '--stem', type=int,
                    help='whether stem the texts before computing the metrics; 1 yes, 0 no')
    ap.add_argument('-rs', '--remove_stop', type=int,
                    help='whether remove stop words in texts before computing the metrics; 1 yes, 0 no')
    args = ap.parse_args()
    return args.metric, args.prompt, args.with_ref, args.stem, args.remove_stop


if __name__ == '__main__':
    metric, prompt, with_ref, stem, remove_stop = parse_args()
    # Integer flags (or None when omitted) become plain booleans.
    with_ref = bool(with_ref)
    stem = bool(stem)
    remove_stop = bool(remove_stop)

    print('\n=====Arguments====')
    print('metric: '+metric)
    print('prompt: '+prompt)
    print('with ref: '+repr(with_ref))
    print('stem: '+repr(stem))
    print('remove stopwords: '+repr(remove_stop))
    print('=====Arguments====\n')
    # NOTE(review): the remainder of this entry point is elided in the visible
    # source; the placeholder below stands in for it.
    ...

...62 if isinstance(self.src, Variable):63 with self.src.open_for_read(out) as srcref:64 scaled, is_temp = self.dest.scale_other_to_this(self.src,65 srcref, out)66 out.write(self.with_ref(ref, scaled))67 if is_temp:68 out.free_temp(scaled)69 else:70 self.apply_const_src(ref, self.dest.to_int(self.src), out)71 if not self.is_additive:72 self.dest.scale_down(ref, out)73 def apply_const_src(self, ref, val, out):74 if val < 0 and self.with_neg_const is not None:75 out.write(self.with_neg_const(ref, -val))76 else:77 out.write(self.with_const(ref, val))78 def run(self, ev):79 assert isinstance(self.dest, CompilerVariable)80 if isinstance(self.src, Variable):81 assert isinstance(self.src, CompilerVariable)82 src = self.src.get_value()83 else:84 src = self.src85 self.dest.set_value(self.constfunc(self.dest.get_value(), src))86 def serialize(self, holder):87 dest, src = self.serialize_args(holder)88 return '%s %s %s' % (dest, self.with_ref.op, src)89 __op_lookup = {}90 @classmethod91 def lookup_by_op(cls, op):92 if not len(cls.__op_lookup):93 for clz in get_subclasses(cls):94 if hasattr(clz, 'with_ref'):95 cls.__op_lookup[clz.with_ref.op] = clz96 return cls.__op_lookup[op]97import operator98class OnlyRefOperationInsn(SimpleOperationInsn):99 def apply_const_src(self, ref, val, out):100 srcref = out.allocate_temp()101 out.write(c.SetConst(srcref, val))102 out.write(self.with_ref(ref, srcref))103 out.free_temp(srcref)104class AddScore(SimpleOperationInsn):105 with_ref = c.OpAdd106 with_const = c.AddConst107 with_neg_const = c.RemConst108 constfunc = operator.add109 identity = 0110 is_additive = True111class SubScore(SimpleOperationInsn):112 with_ref = c.OpSub113 with_const = c.RemConst114 with_neg_const = c.AddConst115 constfunc = operator.sub116 identity = 0...

