Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yaml crowspairs tasks #2488

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
3393c84
Tokenizer handling
KlaudiaTH Mar 13, 2024
47c3c4a
Added ogx gsm8k implementation
KlaudiaTH Mar 13, 2024
cf9e190
Added multilingual arc, hellaswag, mmlu and truthfulqa from EleutherA…
KlaudiaTH Mar 13, 2024
8fa981d
Added ogx hellaswagx task
KlaudiaTH Mar 13, 2024
25f0b25
Added ogx arcx tasks
KlaudiaTH Mar 13, 2024
93f549c
Task fixes
KlaudiaTH Mar 13, 2024
f96e585
added truthfulqax yaml configs
jjbuschhoff Mar 13, 2024
2fcfe02
Task fixes
KlaudiaTH Mar 13, 2024
bcb4104
added mmlux task configs
jjbuschhoff Mar 15, 2024
e8d9f38
Merge branch 'yaml_multilingual_tasks' of https://github.com/OpenGPTX…
jjbuschhoff Mar 15, 2024
521504e
Task corrections: MMLU and GSM8k
KlaudiaTH Mar 18, 2024
3a0035c
added generation templates
jjbuschhoff Mar 18, 2024
de88d6b
Merge branch 'yaml_multilingual_tasks' of https://github.com/OpenGPTX…
jjbuschhoff Mar 18, 2024
4900eff
Corrected task names ogx_thruthfulqa_* to ogx_truthfulqa_*
KlaudiaTH Mar 19, 2024
4fe393c
Tasks corrections: MMLU and GSM8k
KlaudiaTH Mar 22, 2024
f6663f2
Added --bootstrap_iters command line argument
KlaudiaTH Apr 15, 2024
28bf93f
Preliminary implementation of flores and belebele
KlaudiaTH Apr 15, 2024
f8e00f7
Added flores nll fix
KlaudiaTH Apr 15, 2024
7447360
Added missing nll metric implementation
KlaudiaTH Apr 17, 2024
485acdd
added latvian flores200 configs
jjbuschhoff Jun 28, 2024
6092ac5
added lvs config for belebele, also renamed config files
jjbuschhoff Jun 28, 2024
6b544a9
FLORES-200: Added missing Latvian and removed unnecessary Irish, Malt…
KlaudiaTH Jul 6, 2024
6cc3360
HF model: Added 'nccl_timeout' model arg
KlaudiaTH Jul 5, 2024
1bd5e6a
Crowspairs task yaml
NAM00 Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
8 changes: 8 additions & 0 deletions lm_eval/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ def parse_eval_args() -> argparse.Namespace:
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument(
"--bootstrap_iters",
type=int,
default=100000,
metavar="N",
help="Number of bootstrapping iterations for metric standard error estimation.",
)
parser.add_argument(
"--use_cache",
"-c",
Expand Down Expand Up @@ -238,6 +245,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
device=args.device,
use_cache=args.use_cache,
limit=args.limit,
bootstrap_iters=args.bootstrap_iters,
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity,
write_out=args.write_out,
Expand Down
14 changes: 13 additions & 1 deletion lm_eval/api/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,21 @@ def exact_match_fn(**kwargs):
output_type="loglikelihood",
aggregation="perplexity",
)
def perplexity_fn(items): # This is a passthrough function
def perplexity_fn(items):
return items

@register_aggregation("nll")
def nll(items):
    """Aggregation: negative mean of the collected per-item loglikelihoods."""
    average_loglikelihood = mean(items)
    return -average_loglikelihood

@register_metric(
    metric="nll",
    higher_is_better=False,
    output_type="loglikelihood",
    aggregation="nll",
)
def nll_fn(items):
    # Passthrough: per-item loglikelihoods are reduced by the registered
    # "nll" aggregation (negative mean) at aggregation time.
    return items

@register_metric(
metric="word_perplexity",
Expand Down
46 changes: 22 additions & 24 deletions lm_eval/models/huggingface.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from accelerate.utils import InitProcessGroupKwargs
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
Expand Down Expand Up @@ -99,6 +101,10 @@ def __init__(
**kwargs,
) -> None:
super().__init__()
nccl_timeout = timedelta(seconds=float(kwargs.pop("nccl_timeout", 3600)))
if nccl_timeout == timedelta(seconds=1800):
eval_logger.warn("nccl_timeout cannot be set to 1800 due to a bug in accelerate. Setting to 1800.001 instead.")
nccl_timeout += timedelta(milliseconds=1)

# optionally: take in an already-initialized transformers.PreTrainedModel
if not isinstance(pretrained, str):
Expand Down Expand Up @@ -132,7 +138,7 @@ def __init__(
assert isinstance(batch_size, (int, str))

gpus = torch.cuda.device_count()
accelerator = Accelerator()
accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=nccl_timeout)])
if accelerator.num_processes > 1:
self.accelerator = accelerator

Expand Down Expand Up @@ -742,34 +748,26 @@ def _select_cont_toks(self, logits, contlen=None, inplen=None):

return logits

def _encode_pair(
    self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
    """Tokenize a (context, continuation) pair consistently.

    Trailing whitespace on the context is moved onto the continuation so
    the boundary tokenizes the same way it would in the concatenated
    string; the continuation tokens are then recovered by encoding the
    concatenation and slicing off the context's token prefix.
    """
    # Shift any trailing spaces from the context to the continuation.
    n_spaces = len(context) - len(context.rstrip())
    if n_spaces > 0:
        continuation = context[-n_spaces:] + continuation
        context = context[:-n_spaces]

    # Encode with special tokens disabled so the context encoding is a
    # clean prefix of the whole-string encoding.
    whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
    context_enc = self.tok_encode(context, add_special_tokens=False)

    # NOTE(review): slicing assumes context_enc is a strict prefix of
    # whole_enc; some tokenizers merge tokens across the boundary — confirm.
    context_enc_len = len(context_enc)
    continuation_enc = whole_enc[context_enc_len:]
    return context_enc, continuation_enc

def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
new_reqs = []
for context, continuation in [req.args for req in requests]:
continuation_enc = self.tok_encode(continuation)

if context == "":
# end of text as context
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
context_enc = [self.eot_token_id]
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
context_enc = self.tok_encode(context, add_special_tokens=False)
ctx_cont_enc = self.tok_encode(context + continuation, add_special_tokens=False)

if context_enc + continuation_enc != ctx_cont_enc:
if ctx_cont_enc[: len(context_enc)] == context_enc:
continuation_enc = ctx_cont_enc[len(context_enc) :]
elif ctx_cont_enc[-len(continuation_enc) :] == continuation_enc:
context_enc = ctx_cont_enc[: -len(continuation_enc)]
else:
print(
f"WARNING: Unnatural tokenization of concatenated context ...{repr(context[-20:])} and continuation {repr(continuation)}"
)

new_reqs.append(((context, continuation), context_enc, continuation_enc))

Expand Down
126 changes: 126 additions & 0 deletions lm_eval/tasks/CrowsPairs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# CrowS-Pairs

### Paper

CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models
https://aclanthology.org/2020.emnlp-main.154/
French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked
language models to a language other than English
https://aclanthology.org/2022.acl-long.583/

CrowS-Pairs is a challenge set for evaluating the tendency of language models (LMs)
to generate biased outputs. CrowS-Pairs comes in two languages, and the English subset
has a newer version that fixes some of the issues with the original version.

Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french-crows-pairs

### Citation

```bibtex
@inproceedings{nangia-etal-2020-crows,
title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models",
author = "Nangia, Nikita and
Vania, Clara and
Bhalerao, Rasika and
Bowman, Samuel R.",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.154",
doi = "10.18653/v1/2020.emnlp-main.154",
pages = "1953--1967",
abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
}
```

### Groups and Tasks

#### Groups

- `crows_pairs_english`: The entire English subset of the CrowS-Pairs dataset.
- `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset.
- `crows_pairs_german`: The entire German subset of the CrowS-Pairs dataset.
- `crows_pairs_spanish`: The entire Spanish subset of the CrowS-Pairs dataset.
- `crows_pairs_italian`: The entire Italian subset of the CrowS-Pairs dataset.


#### Tasks


The following tasks evaluate sub-areas of bias in the English CrowS-Pairs dataset:
- `crows_pairs_english_age`
- `crows_pairs_english_autre`
- `crows_pairs_english_disability`
- `crows_pairs_english_gender`
- `crows_pairs_english_nationality`
- `crows_pairs_english_physical_appearance`
- `crows_pairs_english_race_color`
- `crows_pairs_english_religion`
- `crows_pairs_english_sexual_orientation`
- `crows_pairs_english_socioeconomic`

The following tasks evaluate sub-areas of bias in the French CrowS-Pairs dataset:
- `crows_pairs_french_age`
- `crows_pairs_french_autre`
- `crows_pairs_french_disability`
- `crows_pairs_french_gender`
- `crows_pairs_french_nationality`
- `crows_pairs_french_physical_appearance`
- `crows_pairs_french_race_color`
- `crows_pairs_french_religion`
- `crows_pairs_french_sexual_orientation`
- `crows_pairs_french_socioeconomic`

The following tasks evaluate sub-areas of bias in the German CrowS-Pairs dataset:
- `crows_pairs_german_age`
- `crows_pairs_german_autre`
- `crows_pairs_german_disability`
- `crows_pairs_german_gender`
- `crows_pairs_german_nationality`
- `crows_pairs_german_physical_appearance`
- `crows_pairs_german_race_color`
- `crows_pairs_german_religion`
- `crows_pairs_german_sexual_orientation`
- `crows_pairs_german_socioeconomic`

The following tasks evaluate sub-areas of bias in the Spanish CrowS-Pairs dataset:
- `crows_pairs_spanish_age`
- `crows_pairs_spanish_autre`
- `crows_pairs_spanish_disability`
- `crows_pairs_spanish_gender`
- `crows_pairs_spanish_nationality`
- `crows_pairs_spanish_physical_appearance`
- `crows_pairs_spanish_race_color`
- `crows_pairs_spanish_religion`
- `crows_pairs_spanish_sexual_orientation`
- `crows_pairs_spanish_socioeconomic`


The following tasks evaluate sub-areas of bias in the Italian CrowS-Pairs dataset:
- `crows_pairs_italian_age`
- `crows_pairs_italian_autre`
- `crows_pairs_italian_disability`
- `crows_pairs_italian_gender`
- `crows_pairs_italian_nationality`
- `crows_pairs_italian_physical_appearance`
- `crows_pairs_italian_race_color`
- `crows_pairs_italian_religion`
- `crows_pairs_italian_sexual_orientation`
- `crows_pairs_italian_socioeconomic`


All tasks evaluate the percentage of more-stereotypical sentences that a model rates as more likely than the corresponding non-stereotypical sentences (`pct_stereotype`), as well as the average absolute difference in log-likelihood between the two sentences in each pair (`likelihood_diff`).

### Checklist

* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] The original paper does not provide one for causal language models, so this is a novel formulation of the task for autoregressive LMs.

If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
23 changes: 23 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# CrowS-Pairs (English): each document is a pair of minimally different
# sentences, one more stereotyping than the other; the model's loglikelihood
# for each sentence is compared by the custom process_results function.
group:
  - crows_pairs_tasks
  - social_bias
  - loglikelihood
task: crows_pairs_english
dataset_path: openGPT-X/crows_pairs_en
dataset_name:
test_split: test
output_type: multiple_choice
# Empty prompt: each choice is scored as a standalone sentence.
doc_to_text: ""
doc_to_target: 0
doc_to_choice: !function utils.doc_to_choice
target_delimiter: ""
# Computes likelihood_diff and pct_stereotype from the two loglikelihoods.
process_results: !function utils.process_results
metric_list:
  # Mean absolute loglikelihood difference within each sentence pair.
  - metric: likelihood_diff
    aggregation: mean
    higher_is_better: false
  # Fraction of pairs where the stereotypical sentence is rated more likely
  # (lower is better — closer to 50% means less measured bias).
  - metric: pct_stereotype
    aggregation: mean
    higher_is_better: false
metadata:
  version: 1.0
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_age.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Age-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_age
dataset_name:
process_docs: !function utils.filter_age
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_autre.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# "Other"-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_autre
dataset_name:
process_docs: !function utils.filter_autre
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Disability-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_disability
dataset_name:
process_docs: !function utils.filter_disability
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_gender.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Gender-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_gender
dataset_name:
process_docs: !function utils.filter_gender
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Nationality-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_nationality
dataset_name:
process_docs: !function utils.filter_nationality
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Physical-appearance-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_physical_appearance
dataset_name:
process_docs: !function utils.filter_appearance
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Race/color-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_race_color
dataset_name:
process_docs: !function utils.filter_race_color
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_religion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Religion-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_religion
dataset_name:
process_docs: !function utils.filter_religion
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Sexual-orientation-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_sexual_orientation
dataset_name:
process_docs: !function utils.filter_orientation
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Socioeconomic-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_socioeconomic
dataset_name:
process_docs: !function utils.filter_socio
23 changes: 23 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# CrowS-Pairs (French): each document is a pair of minimally different
# sentences, one more stereotyping than the other; the model's loglikelihood
# for each sentence is compared by the custom process_results function.
group:
  - crows_pairs_tasks
  - social_bias
  - loglikelihood
task: crows_pairs_french
dataset_path: openGPT-X/crows_pairs_fr
dataset_name:
test_split: test
output_type: multiple_choice
# Empty prompt: each choice is scored as a standalone sentence.
doc_to_text: ""
doc_to_target: 0
doc_to_choice: !function utils.doc_to_choice
target_delimiter: ""
# Computes likelihood_diff and pct_stereotype from the two loglikelihoods.
process_results: !function utils.process_results
metric_list:
  # Mean absolute loglikelihood difference within each sentence pair.
  - metric: likelihood_diff
    aggregation: mean
    higher_is_better: false
  # Fraction of pairs where the stereotypical sentence is rated more likely
  # (lower is better — closer to 50% means less measured bias).
  - metric: pct_stereotype
    aggregation: mean
    higher_is_better: false
metadata:
  version: 1.0
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_age.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Age-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_age
dataset_name:
process_docs: !function utils.filter_age
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_autre.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# "Autre" (other) bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_autre
dataset_name:
process_docs: !function utils.filter_autre
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_disability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Disability-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_disability
dataset_name:
process_docs: !function utils.filter_disability
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_gender.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Gender-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_gender
dataset_name:
process_docs: !function utils.filter_gender
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Nationality-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_nationality
dataset_name:
process_docs: !function utils.filter_nationality
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Physical-appearance-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_physical_appearance
dataset_name:
process_docs: !function utils.filter_appearance
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_race_color.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Race/color-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_race_color
dataset_name:
process_docs: !function utils.filter_race_color
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_religion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Religion-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_religion
dataset_name:
process_docs: !function utils.filter_religion
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Sexual-orientation-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_sexual_orientation
dataset_name:
process_docs: !function utils.filter_orientation
Loading