Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yaml crowspairs tasks #2488

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
3393c84
Tokenizer handling
KlaudiaTH Mar 13, 2024
47c3c4a
Added ogx gsm8k implementation
KlaudiaTH Mar 13, 2024
cf9e190
Added multilingual arc, hellaswag, mmlu and truthfulqa from EleutherA…
KlaudiaTH Mar 13, 2024
8fa981d
Added ogx hellaswagx task
KlaudiaTH Mar 13, 2024
25f0b25
Added ogx arcx tasks
KlaudiaTH Mar 13, 2024
93f549c
Task fixes
KlaudiaTH Mar 13, 2024
f96e585
added truthfulqax yaml configs
jjbuschhoff Mar 13, 2024
2fcfe02
Task fixes
KlaudiaTH Mar 13, 2024
bcb4104
added mmlux task configs
jjbuschhoff Mar 15, 2024
e8d9f38
Merge branch 'yaml_multilingual_tasks' of https://github.com/OpenGPTX…
jjbuschhoff Mar 15, 2024
521504e
Task corrections: MMLU and GSM8k
KlaudiaTH Mar 18, 2024
3a0035c
added generation templates
jjbuschhoff Mar 18, 2024
de88d6b
Merge branch 'yaml_multilingual_tasks' of https://github.com/OpenGPTX…
jjbuschhoff Mar 18, 2024
4900eff
Corrected task names ogx_thruthfulqa_* to ogx_truthfulqa_*
KlaudiaTH Mar 19, 2024
4fe393c
Tasks corrections: MMLU and GSM8k
KlaudiaTH Mar 22, 2024
f6663f2
Added --bootstrap_iters command line argument
KlaudiaTH Apr 15, 2024
28bf93f
Preliminary implementation of flores and belebele
KlaudiaTH Apr 15, 2024
f8e00f7
Added flores nll fix
KlaudiaTH Apr 15, 2024
7447360
Added missing nll metric implementation
KlaudiaTH Apr 17, 2024
485acdd
added latvian flores200 configs
jjbuschhoff Jun 28, 2024
6092ac5
added lvs config for belebele, also renamed config files
jjbuschhoff Jun 28, 2024
6b544a9
FLORES-200: Added missing Latvian and removed unnecessary Irish, Malt…
KlaudiaTH Jul 6, 2024
6cc3360
HF model: Added 'nccl_timeout' model arg
KlaudiaTH Jul 5, 2024
1bd5e6a
Crowspairs task yaml
NAM00 Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
8 changes: 8 additions & 0 deletions lm_eval/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ def parse_eval_args() -> argparse.Namespace:
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument(
"--bootstrap_iters",
type=int,
default=100000,
metavar="N",
help="Number of bootstrapping iterations for metric standard error estimation.",
)
parser.add_argument(
"--use_cache",
"-c",
Expand Down Expand Up @@ -238,6 +245,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
device=args.device,
use_cache=args.use_cache,
limit=args.limit,
bootstrap_iters=args.bootstrap_iters,
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity,
write_out=args.write_out,
Expand Down
14 changes: 13 additions & 1 deletion lm_eval/api/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,21 @@ def exact_match_fn(**kwargs):
output_type="loglikelihood",
aggregation="perplexity",
)
def perplexity_fn(items): # This is a passthrough function
def perplexity_fn(items):
return items

@register_aggregation("nll")
def nll(items):
    """Aggregation: negative mean of the collected per-item loglikelihoods."""
    average_loglikelihood = mean(items)
    return -average_loglikelihood

@register_metric(
    metric="nll",
    higher_is_better=False,
    output_type="loglikelihood",
    aggregation="nll",
)
def nll_fn(items):
    # Passthrough: per-item loglikelihoods are reduced by the registered
    # "nll" aggregation (negative mean) at aggregation time.
    return items

@register_metric(
metric="word_perplexity",
Expand Down
46 changes: 22 additions & 24 deletions lm_eval/models/huggingface.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from accelerate.utils import InitProcessGroupKwargs
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
Expand Down Expand Up @@ -99,6 +101,10 @@ def __init__(
**kwargs,
) -> None:
super().__init__()
nccl_timeout = timedelta(seconds=float(kwargs.pop("nccl_timeout", 3600)))
if nccl_timeout == timedelta(seconds=1800):
eval_logger.warn("nccl_timeout cannot be set to 1800 due to a bug in accelerate. Setting to 1800.001 instead.")
nccl_timeout += timedelta(milliseconds=1)

# optionally: take in an already-initialized transformers.PreTrainedModel
if not isinstance(pretrained, str):
Expand Down Expand Up @@ -132,7 +138,7 @@ def __init__(
assert isinstance(batch_size, (int, str))

gpus = torch.cuda.device_count()
accelerator = Accelerator()
accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=nccl_timeout)])
if accelerator.num_processes > 1:
self.accelerator = accelerator

Expand Down Expand Up @@ -742,34 +748,26 @@ def _select_cont_toks(self, logits, contlen=None, inplen=None):

return logits

def _encode_pair(
    self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
    """Tokenize a (context, continuation) pair consistently.

    Trailing whitespace on the context is moved onto the continuation so
    the boundary tokenizes the same way it would in the concatenated
    string; the continuation tokens are then recovered by encoding the
    concatenation and slicing off the context's token prefix.
    """
    # Shift any trailing spaces from the context to the continuation.
    n_spaces = len(context) - len(context.rstrip())
    if n_spaces > 0:
        continuation = context[-n_spaces:] + continuation
        context = context[:-n_spaces]

    # Encode with special tokens disabled so the context encoding is a
    # clean prefix of the whole-string encoding.
    whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
    context_enc = self.tok_encode(context, add_special_tokens=False)

    # NOTE(review): slicing assumes context_enc is a strict prefix of
    # whole_enc; some tokenizers merge tokens across the boundary — confirm.
    context_enc_len = len(context_enc)
    continuation_enc = whole_enc[context_enc_len:]
    return context_enc, continuation_enc

def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
new_reqs = []
for context, continuation in [req.args for req in requests]:
continuation_enc = self.tok_encode(continuation)

if context == "":
# end of text as context
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
context_enc = [self.eot_token_id]
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
context_enc = self.tok_encode(context, add_special_tokens=False)
ctx_cont_enc = self.tok_encode(context + continuation, add_special_tokens=False)

if context_enc + continuation_enc != ctx_cont_enc:
if ctx_cont_enc[: len(context_enc)] == context_enc:
continuation_enc = ctx_cont_enc[len(context_enc) :]
elif ctx_cont_enc[-len(continuation_enc) :] == continuation_enc:
context_enc = ctx_cont_enc[: -len(continuation_enc)]
else:
print(
f"WARNING: Unnatural tokenization of concatenated context ...{repr(context[-20:])} and continuation {repr(continuation)}"
)

new_reqs.append(((context, continuation), context_enc, continuation_enc))

Expand Down
126 changes: 126 additions & 0 deletions lm_eval/tasks/CrowsPairs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# CrowS-Pairs

### Paper

CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models
https://aclanthology.org/2020.emnlp-main.154/
French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked
language models to a language other than English
https://aclanthology.org/2022.acl-long.583/

CrowS-Pairs is a challenge set for evaluating the tendency of language models (LMs)
to generate biased outputs. CrowS-Pairs comes in two languages, and the English subset
has a newer version that fixes some of the issues with the original version.

Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french-crows-pairs

### Citation

```bibtex
@inproceedings{nangia-etal-2020-crows,
title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models",
author = "Nangia, Nikita and
Vania, Clara and
Bhalerao, Rasika and
Bowman, Samuel R.",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.154",
doi = "10.18653/v1/2020.emnlp-main.154",
pages = "1953--1967",
abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
}
```

### Groups and Tasks

#### Groups

- `crows_pairs_english`: The entire English subset of the CrowS-Pairs dataset.
- `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset.
- `crows_pairs_german`: The entire German subset of the CrowS-Pairs dataset.
- `crows_pairs_spanish`: The entire Spanish subset of the CrowS-Pairs dataset.
- `crows_pairs_italian`: The entire Italian subset of the CrowS-Pairs dataset.


#### Tasks


The following tasks evaluate sub-areas of bias in the English CrowS-Pairs dataset:
- `crows_pairs_english_age`
- `crows_pairs_english_autre`
- `crows_pairs_english_disability`
- `crows_pairs_english_gender`
- `crows_pairs_english_nationality`
- `crows_pairs_english_physical_appearance`
- `crows_pairs_english_race_color`
- `crows_pairs_english_religion`
- `crows_pairs_english_sexual_orientation`
- `crows_pairs_english_socioeconomic`

The following tasks evaluate sub-areas of bias in the French CrowS-Pairs dataset:
- `crows_pairs_french_age`
- `crows_pairs_french_autre`
- `crows_pairs_french_disability`
- `crows_pairs_french_gender`
- `crows_pairs_french_nationality`
- `crows_pairs_french_physical_appearance`
- `crows_pairs_french_race_color`
- `crows_pairs_french_religion`
- `crows_pairs_french_sexual_orientation`
- `crows_pairs_french_socioeconomic`

The following tasks evaluate sub-areas of bias in the German CrowS-Pairs dataset:
- `crows_pairs_german_age`
- `crows_pairs_german_autre`
- `crows_pairs_german_disability`
- `crows_pairs_german_gender`
- `crows_pairs_german_nationality`
- `crows_pairs_german_physical_appearance`
- `crows_pairs_german_race_color`
- `crows_pairs_german_religion`
- `crows_pairs_german_sexual_orientation`
- `crows_pairs_german_socioeconomic`

The following tasks evaluate sub-areas of bias in the Spanish CrowS-Pairs dataset:
- `crows_pairs_spanish_age`
- `crows_pairs_spanish_autre`
- `crows_pairs_spanish_disability`
- `crows_pairs_spanish_gender`
- `crows_pairs_spanish_nationality`
- `crows_pairs_spanish_physical_appearance`
- `crows_pairs_spanish_race_color`
- `crows_pairs_spanish_religion`
- `crows_pairs_spanish_sexual_orientation`
- `crows_pairs_spanish_socioeconomic`


The following tasks evaluate sub-areas of bias in the Italian CrowS-Pairs dataset:
- `crows_pairs_italian_age`
- `crows_pairs_italian_autre`
- `crows_pairs_italian_disability`
- `crows_pairs_italian_gender`
- `crows_pairs_italian_nationality`
- `crows_pairs_italian_physical_appearance`
- `crows_pairs_italian_race_color`
- `crows_pairs_italian_religion`
- `crows_pairs_italian_sexual_orientation`
- `crows_pairs_italian_socioeconomic`


All tasks evaluate the percentage of more-stereotypical sentences that a model rates as more likely than the corresponding non-stereotypical sentences (`pct_stereotype`), as well as the average absolute difference in log-likelihood between the two sentences in each pair (`likelihood_diff`).

### Checklist

* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] The original paper does not provide one for causal language models, so this is a novel formulation of the task for autoregressive LMs.

If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
23 changes: 23 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# CrowS-Pairs (English): each document is a pair of minimally different
# sentences, one more stereotyping than the other; the model's loglikelihood
# for each sentence is compared by the custom process_results function.
group:
  - crows_pairs_tasks
  - social_bias
  - loglikelihood
task: crows_pairs_english
dataset_path: openGPT-X/crows_pairs_en
dataset_name:
test_split: test
output_type: multiple_choice
# Empty prompt: each choice is scored as a standalone sentence.
doc_to_text: ""
doc_to_target: 0
doc_to_choice: !function utils.doc_to_choice
target_delimiter: ""
# Computes likelihood_diff and pct_stereotype from the two loglikelihoods.
process_results: !function utils.process_results
metric_list:
  # Mean absolute loglikelihood difference within each sentence pair.
  - metric: likelihood_diff
    aggregation: mean
    higher_is_better: false
  # Fraction of pairs where the stereotypical sentence is rated more likely
  # (lower is better — closer to 50% means less measured bias).
  - metric: pct_stereotype
    aggregation: mean
    higher_is_better: false
metadata:
  version: 1.0
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_age.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Age-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_age
dataset_name:
process_docs: !function utils.filter_age
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_autre.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# "Other"-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_autre
dataset_name:
process_docs: !function utils.filter_autre
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Disability-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_disability
dataset_name:
process_docs: !function utils.filter_disability
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_gender.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Gender-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_gender
dataset_name:
process_docs: !function utils.filter_gender
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Nationality-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_nationality
dataset_name:
process_docs: !function utils.filter_nationality
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Physical-appearance-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_physical_appearance
dataset_name:
process_docs: !function utils.filter_appearance
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Race/color-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_race_color
dataset_name:
process_docs: !function utils.filter_race_color
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_english_religion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Religion-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_religion
dataset_name:
process_docs: !function utils.filter_religion
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Sexual-orientation-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_sexual_orientation
dataset_name:
process_docs: !function utils.filter_orientation
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Socioeconomic-bias subset of the English CrowS-Pairs task.
include: crows_pairs_english.yaml
task: crows_pairs_english_socioeconomic
dataset_name:
process_docs: !function utils.filter_socio
23 changes: 23 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# CrowS-Pairs (French): each document is a pair of minimally different
# sentences, one more stereotyping than the other; the model's loglikelihood
# for each sentence is compared by the custom process_results function.
group:
  - crows_pairs_tasks
  - social_bias
  - loglikelihood
task: crows_pairs_french
dataset_path: openGPT-X/crows_pairs_fr
dataset_name:
test_split: test
output_type: multiple_choice
# Empty prompt: each choice is scored as a standalone sentence.
doc_to_text: ""
doc_to_target: 0
doc_to_choice: !function utils.doc_to_choice
target_delimiter: ""
# Computes likelihood_diff and pct_stereotype from the two loglikelihoods.
process_results: !function utils.process_results
metric_list:
  # Mean absolute loglikelihood difference within each sentence pair.
  - metric: likelihood_diff
    aggregation: mean
    higher_is_better: false
  # Fraction of pairs where the stereotypical sentence is rated more likely
  # (lower is better — closer to 50% means less measured bias).
  - metric: pct_stereotype
    aggregation: mean
    higher_is_better: false
metadata:
  version: 1.0
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_age.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Age-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_age
dataset_name:
process_docs: !function utils.filter_age
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_autre.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# "Autre" (other) bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_autre
dataset_name:
process_docs: !function utils.filter_autre
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_disability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Disability-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_disability
dataset_name:
process_docs: !function utils.filter_disability
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_gender.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Gender-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_gender
dataset_name:
process_docs: !function utils.filter_gender
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Nationality-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_nationality
dataset_name:
process_docs: !function utils.filter_nationality
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Physical-appearance-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_physical_appearance
dataset_name:
process_docs: !function utils.filter_appearance
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_race_color.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Race/color-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_race_color
dataset_name:
process_docs: !function utils.filter_race_color
4 changes: 4 additions & 0 deletions lm_eval/tasks/CrowsPairs/crows_pairs_french_religion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Religion-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_religion
dataset_name:
process_docs: !function utils.filter_religion
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Sexual-orientation-bias subset of the French CrowS-Pairs task.
include: crows_pairs_french.yaml
task: crows_pairs_french_sexual_orientation
dataset_name:
process_docs: !function utils.filter_orientation
Loading