Feat: (experimental) More user-centric approach to managing evaluation rubrics within the flow-judge library #17

Draft · wants to merge 2 commits into base: main
14 changes: 2 additions & 12 deletions flow_judge/flow_judge.py
@@ -4,7 +4,6 @@
from flow_judge.eval_data_types import EvalInput, EvalOutput
from flow_judge.metrics import CustomMetric, Metric
from flow_judge.models.common import AsyncBaseFlowJudgeModel, BaseFlowJudgeModel
from flow_judge.utils.prompt_formatter import format_rubric, format_user_prompt, format_vars
from flow_judge.utils.result_writer import write_results_to_disk
from flow_judge.utils.validators import validate_eval_input

@@ -19,7 +18,7 @@ def __init__(
self,
metric: Metric | CustomMetric,
model: BaseFlowJudgeModel | AsyncBaseFlowJudgeModel,
output_dir: str | None = "output/",
output_dir: str = "output/",
):
"""Initialize BaseFlowJudge with a metric and model."""
if not isinstance(metric, (Metric, CustomMetric)):
@@ -30,13 +29,7 @@ def __init__(

def _format_prompt(self, eval_input: EvalInput) -> str:
"""Format the prompt for a single evaluation input."""
prompt_variables = {
"INPUTS": format_vars(eval_input.inputs),
"OUTPUT": format_vars([eval_input.output]),
"EVALUATION_CRITERIA": self.metric.criteria,
"RUBRIC": format_rubric(self.metric.rubric),
}
return format_user_prompt(prompt_variables)
return self.metric.format_prompt(eval_input.dict())

def _validate_inputs(self, eval_inputs: EvalInput | list[EvalInput]):
"""Validate required inputs and output against the metric."""
@@ -102,7 +95,6 @@ def batch_evaluate(
self._save_results(eval_inputs, eval_outputs)
if parse_failures > 0:
logger.warning(f"Number of parsing failures: {parse_failures} out of {len(responses)}")

return eval_outputs


@@ -152,8 +144,6 @@ async def async_batch_evaluate(
parse_failures = sum(1 for output in eval_outputs if output.score == -1)
if save_results:
await asyncio.to_thread(self._save_results, eval_inputs, eval_outputs)

if parse_failures > 0:
logger.warning(f"Number of parsing failures: {parse_failures} out of {len(responses)}")

return eval_outputs
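
With this change, prompt construction moves out of `FlowJudge` and into the metric itself: `_format_prompt` now simply returns `self.metric.format_prompt(eval_input.dict())`. Below is a minimal sketch of the contract this implies for a metric object; `RubricPromptMetric` and its prompt layout are hypothetical illustrations, not part of the library's API.

```python
# Illustrative only: a hypothetical metric object satisfying the
# format_prompt(eval_input_dict) -> str contract used by _format_prompt.
class RubricPromptMetric:
    def __init__(self, name: str, criteria: str, rubric: list[dict]):
        self.name = name
        self.criteria = criteria
        # Rubric entries mirror the YAML files below: {"score": ..., "description": ...}
        self.rubric = rubric

    def format_prompt(self, eval_input: dict) -> str:
        """Build the judge prompt from a serialized EvalInput dict."""
        rubric_text = "\n".join(
            f"- Score {item['score']}: {item['description']}" for item in self.rubric
        )
        payload = "\n".join(f"{key}: {value}" for key, value in eval_input.items())
        return (
            f"Evaluation criteria: {self.criteria}\n\n"
            f"Rubric:\n{rubric_text}\n\n"
            f"Data to evaluate:\n{payload}"
        )


# Example of what FlowJudge._format_prompt would receive and produce.
metric = RubricPromptMetric(
    name="article_clarity",
    criteria="Is the article clear and easy to understand?",
    rubric=[{"score": 0, "description": "Unclear."}, {"score": 1, "description": "Clear."}],
)
print(metric.format_prompt({"inputs": [{"context": "..."}], "output": {"article": "..."}}))
```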
66 changes: 66 additions & 0 deletions flow_judge/metrics/README.md
@@ -0,0 +1,66 @@
## Example Rubrics and Requesting New Ones

We provide a collection of example rubrics in the `example_rubrics` directory. These rubrics are written in YAML format for easy customization and integration into your evaluation workflows.

### Browsing Example Rubrics

You can find example rubrics for various evaluation tasks in the `example_rubrics` directory. Each rubric is stored as a YAML file and includes the following information (a minimal loading sketch follows this list):

- `name`: A unique identifier for the rubric
- `description`: A brief description of what the rubric evaluates
- `criteria`: The main evaluation criteria
- `rubric`: A list of scoring options with descriptions
- `required_inputs`: The inputs required for the evaluation
- `required_output`: The output to be evaluated
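
As a quick orientation, the snippet below is a minimal sketch of reading one of these files with PyYAML and checking that the fields above are present. The `load_rubric` helper and the file path are illustrative assumptions, not part of the `flow-judge` API; use `create_judge_from_yaml` (shown later in this README) for actual loading.

```python
# Illustrative only: load a rubric YAML with PyYAML and sanity-check its fields.
# load_rubric and the file path are hypothetical, not part of flow-judge.
import yaml

REQUIRED_KEYS = {"name", "description", "criteria", "rubric", "required_inputs", "required_output"}


def load_rubric(path: str) -> dict:
    with open(path, encoding="utf-8") as f:
        rubric = yaml.safe_load(f)
    missing = REQUIRED_KEYS - rubric.keys()
    if missing:
        raise ValueError(f"{path} is missing fields: {sorted(missing)}")
    return rubric


rubric = load_rubric("example_rubrics/article_clarity.yaml")  # assumed filename
print(rubric["name"], [item["score"] for item in rubric["rubric"]])
```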

### Requesting New Rubrics

To request a new rubric or modifications to existing ones, you can use our built-in tools:

#### From CLI:

```bash
flow-judge create-rubric-request
```

This interactive command will guide you through the process of creating a new rubric request.

#### From Jupyter Notebook:

```python
from flow_judge.notebook_utils import display_rubric_request_form

display_rubric_request_form()
```

This will display an interactive form in your notebook for creating a new rubric request.

#### Programmatically:

```python
from flow_judge.rubric_utils import request_rubric

request_rubric(
title="Your Rubric Title",
description="Brief description of the rubric",
similar_to="existing_rubric_name", # Optional
custom_fields={"key": "value"} # Optional
)
```

This will open a pre-filled GitHub issue in your browser, making it easy to submit your request. The issue will include:

- A proposed structure for the new rubric
- Reference to a similar existing rubric (if specified)
- A list of all existing rubrics for context
- Any additional custom fields you've provided

By using these tools, you can easily contribute to the growth and improvement of the `flow-judge` library's evaluation capabilities.

### Loading a Rubric into a Judge

Example rubrics can be loaded directly into a judge with `create_judge_from_yaml`:

```python
from flow_judge.rubric_loader import create_judge_from_yaml

judge = create_judge_from_yaml('path/to/rubric.yaml', model_type='vllm')
result = judge.evaluate(eval_input)
```
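
For context, `eval_input` in the snippet above has to match the rubric's `required_inputs` and `required_output`. The sketch below assumes `EvalInput` accepts `inputs` as a list of name/value mappings and `output` as a single name/value mapping, mirroring how `_format_prompt` reads `eval_input.inputs` and `eval_input.output`; check `flow_judge.eval_data_types` for the exact schema, and note that the YAML path is an assumed filename.

```python
from flow_judge.eval_data_types import EvalInput
from flow_judge.rubric_loader import create_judge_from_yaml

# Assumed EvalInput shape: keys mirror the rubric's required_inputs / required_output.
eval_input = EvalInput(
    inputs=[
        {"user_instructions": "Write a short news article about the product launch."},
        {"context": "Press release text and launch-event notes..."},
    ],
    output={"article": "Yesterday, the company unveiled..."},
)

judge = create_judge_from_yaml("example_rubrics/article_clarity.yaml", model_type="vllm")
result = judge.evaluate(eval_input)
print(result.score)  # EvalOutput.score; -1 indicates a parsing failure
```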
@@ -0,0 +1,12 @@
name: article_clarity
description: Evaluates the clarity and readability of an article
criteria: Does the article's writing quality, in terms of clarity, conciseness, and ease of understanding, effectively communicate the information to the reader?
rubric:
- score: 0
description: The article's writing quality is poor to moderate in terms of clarity, conciseness, and ease of understanding. It may have confusing sentence structures, inappropriate vocabulary, lack of organization, or instances of unnecessary verbosity. The writing does not effectively communicate the information to the reader, making it difficult to comprehend the content without significant effort.
- score: 1
description: The article's writing quality is high in terms of clarity, conciseness, and ease of understanding. It features well-constructed sentences, appropriate vocabulary, logical organization, and efficient conveyance of information. The writing effectively communicates the information to the reader, allowing for easy comprehension and a smooth reading experience.
required_inputs:
- user_instructions
- context
required_output: article
@@ -0,0 +1,12 @@
name: article_completeness
description: Evaluates the completeness of an article based on provided instructions and context
criteria: Evaluate the extent to which the article provides comprehensive coverage of all topics, key points, and information specified in the instructions, ensuring that no relevant aspects are omitted or inadequately addressed.
rubric:
- score: 0
description: The article fails to provide comprehensive coverage of the required topics, key points, and information specified in the instructions. It omits crucial information and has significant gaps in addressing relevant aspects.
- score: 1
description: The article offers comprehensive coverage of all required topics, key points, and information specified in the instructions. It thoroughly addresses all relevant aspects, providing in-depth information and leaving no significant gaps in coverage.
required_inputs:
- user_instructions
- context
required_output: article
@@ -0,0 +1,12 @@
name: article_objectivity
description: Evaluates the objectivity and balance of an article
criteria: Evaluate whether the article presents information in an unbiased manner by incorporating multiple perspectives fairly and avoiding partisan or one-sided reporting.
rubric:
- score: 0
description: The article shows significant bias in its reporting. It either presents only one perspective or heavily favors a particular viewpoint. Alternative views are absent, minimized, or unfairly represented. The language used may be loaded or emotionally charged, and sources may be limited to those supporting a single perspective. The overall presentation lacks journalistic objectivity and balance.
- score: 1
description: The article demonstrates a commitment to unbiased reporting. It presents multiple perspectives on the topic, giving fair representation to different viewpoints. The language used is neutral and objective, avoiding loaded terms or emotional rhetoric. The article uses a diverse range of credible sources to support various perspectives. While minor imperfections may exist, the overall presentation maintains journalistic integrity, balance, and objectivity.
required_inputs:
- user_instructions
- context
required_output: article
@@ -0,0 +1,12 @@
name: article_source_attribution
description: Evaluates the accuracy and comprehensiveness of source attribution in an article
criteria: Does the article accurately and comprehensively attribute information to reliable sources, ensuring that these sources align with those provided in the information and instructions?
rubric:
- score: 0
description: The article fails to accurately and comprehensively attribute information to reliable sources. There are significant gaps or inaccuracies in attribution, and many sources either do not align with those provided in the instructions or are unreliable. Attribution practices are inconsistent or inadequate, with key information often lacking proper sourcing.
- score: 1
description: The article accurately and comprehensively attributes information to reliable sources that align with those provided in the information and instructions. Attribution practices are consistently followed throughout the article, with all key information properly sourced and credited. The sourcing is appropriate and demonstrates excellent adherence to attribution standards.
required_inputs:
- user_instructions
- context
required_output: article
@@ -0,0 +1,13 @@
name: sub_query_coverage
description: Evaluates the coverage and relevance of sub-queries generated from a main query
criteria: Do the generated sub-queries provide sufficient breadth to cover all aspects of the main query?
rubric:
- score: 1
description: The sub-queries lack breadth and fail to address multiple important aspects of the main query. They are either too narrow, focusing on only one or two dimensions of the question, or they diverge significantly from the main query's intent. Using these sub-queries alone would result in a severely limited exploration of the topic.
- score: 2
description: The sub-queries cover some aspects of the main query but lack comprehensive breadth. While they touch on several dimensions of the question, there are still noticeable gaps in coverage. Some important facets of the main query are either underrepresented or missing entirely. The sub-queries provide a partial, but incomplete, exploration of the topic.
- score: 3
description: The sub-queries demonstrate excellent breadth, effectively covering all major aspects of the main query. They break down the main question into a diverse set of dimensions, ensuring a comprehensive exploration of the topic. Each significant facet of the main query is represented in the sub-queries, allowing for a thorough and well-rounded investigation of the subject matter.
required_inputs:
- query
required_output: sub_queries
@@ -0,0 +1,14 @@
name: response_correctness_3point
description: Evaluates the correctness of a response using a 3-point Likert scale
criteria: Based on the provided reference response, how well does the system's generated response match the correct answer to the given query?
rubric:
- score: 1
description: The generated response does not match the reference response at all. It either fails to address the query or provides a completely incorrect answer. The information presented is irrelevant, inaccurate, or entirely misses the point of the question. Using this response would lead to a fundamental misunderstanding of the topic.
- score: 2
description: The generated response partially matches the reference response. It addresses the query but may contain some incorrect, irrelevant or incomplete information compared to the reference. While some aspects of the answer are correct, there are noticeable gaps, inaccuracies, or misinterpretations that prevent it from being fully correct. The response demonstrates a partial understanding of the topic but falls short of a comprehensive and accurate answer.
- score: 3
description: The generated response fully matches the reference response. It accurately and completely answers the query, containing all the relevant information from the reference without any incorrect or extraneous details. The response demonstrates a thorough understanding of the topic and provides a comprehensive answer that aligns perfectly with the reference. Any variations in wording or structure do not detract from the accuracy or completeness of the information presented.
required_inputs:
- query
- reference_answer
required_output: response
@@ -0,0 +1,18 @@
name: response_correctness_5point
description: Evaluates the correctness of a response using a 5-point Likert scale
criteria: Compare the system's response to the provided reference answer and rate how well it matches the reference in accuracy and completeness when answering the query.
rubric:
- score: 1
description: The response is completely incorrect or irrelevant to the query, with no overlap in information with the reference answer. It fails to address the question entirely or provides information that is entirely unrelated or contradictory to the correct answer. Using this response would lead to a complete misunderstanding of the topic.
- score: 2
description: The response contains some correct information relevant to the query but is substantially incomplete or inaccurate compared to the reference answer. While there may be elements of truth, the majority of the response is either incorrect, missing crucial information, or so poorly articulated that it fails to effectively answer the query. The inaccuracies or omissions significantly impair the usefulness of the response.
- score: 3
description: The response answers the query with reasonable accuracy but is missing key details or has minor inaccuracies compared to the reference. It demonstrates a basic understanding of the topic and provides some correct information, but falls short of a comprehensive answer. The response may lack depth, omit important nuances, or contain minor errors that, while not completely undermining the answer, detract from its overall quality and completeness.
- score: 4
description: The response accurately answers the query and is nearly complete, only leaving out non-essential details compared to the reference. It demonstrates a strong understanding of the topic and provides a thorough answer that covers all major points. Any omissions are minor and do not significantly impact the overall quality or usefulness of the response. The information presented is accurate and well-articulated, with only minor room for improvement in terms of completeness or detail.
- score: 5
description: The response perfectly matches the accuracy and level of detail of the reference answer, containing all key information to comprehensively answer the query. It demonstrates a complete and nuanced understanding of the topic, providing a response that is indistinguishable in quality and content from the reference answer. All relevant details, including subtle points or exceptions, are accurately presented. The response leaves no room for improvement in terms of accuracy, completeness, or relevance to the query.
required_inputs:
- query
- reference_answer
required_output: response
@@ -0,0 +1,12 @@
name: response_correctness_binary
description: Evaluates the correctness of a response in a binary manner
criteria: Does the generated response accurately match the provided reference answer for the given query?
rubric:
- score: 0
description: The response is incorrect or irrelevant. It either contains inaccurate information, fails to address the query adequately, or provides information that does not align with the reference answer. The response may be off-topic, incomplete, or contain significant errors that render it unsuitable as an answer to the given query.
- score: 1
description: The response is correct and relevant. It accurately addresses the query and aligns closely with the provided reference answer. The information presented is factual, complete, and directly pertinent to the question asked. While the wording may not be identical to the reference answer, the core content and meaning are equivalent, demonstrating a correct understanding and articulation of the required information.
required_inputs:
- query
- reference_answer
required_output: response
@@ -0,0 +1,14 @@
name: response_faithfulness_3point
description: Evaluates the faithfulness of a response to the provided context using a 3-point Likert scale
criteria: Based on the provided context, assess how faithful and consistent the response is to the information given. Check if the response contains any fabricated or hallucinated content that cannot be supported by the context.
rubric:
- score: 1
description: The response contains a significant amount of fabricated information or unsupported claims that directly contradict or deviate from the given context. Major hallucinations are present that are not supported by the context provided. The response introduces substantial new information, makes claims that cannot be inferred from the context, or presents ideas that are inconsistent with the given material. The level of unfaithfulness severely compromises the reliability and accuracy of the response.
- score: 2
description: The response is mostly faithful to the context, but contains some minor unsupported details or slight factual inconsistencies. While the overall message is supported, there are a few deviations that are not directly inferable from the strict context alone. These may include minor embellishments, slight exaggerations, or small details that, while not entirely contradictory, go somewhat beyond what the context explicitly supports. The response remains largely reliable, but with some caveats.
- score: 3
description: The response is completely faithful and consistent with the context provided. All details and claims are directly supported by the information given, without any hallucinated or fabricated content present. The response accurately represents only the facts in the context, making no unsupported claims or inferences. Any conclusions or interpretations are strictly based on the provided information, demonstrating complete faithfulness to the context. The response can be fully trusted as an accurate representation of the given material.
required_inputs:
- query
- context
required_output: response
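
To tie the rubric files back to the evaluation flow, here is a sketch that runs one of them over several inputs at once. It assumes the judge returned by `create_judge_from_yaml` exposes the same `batch_evaluate` seen in the `flow_judge.py` diff above, and that `EvalInput` and the YAML path follow the shapes assumed earlier; treat it as an illustration rather than a verified API call.

```python
from flow_judge.eval_data_types import EvalInput
from flow_judge.rubric_loader import create_judge_from_yaml

# Assumed path; sub_query_coverage requires "query" as input and "sub_queries" as output.
judge = create_judge_from_yaml("example_rubrics/sub_query_coverage.yaml", model_type="vllm")

eval_inputs = [
    EvalInput(
        inputs=[{"query": "What are the health effects of intermittent fasting?"}],
        output={"sub_queries": "1) Metabolic effects 2) Cardiovascular effects 3) Risks for specific groups"},
    ),
    EvalInput(
        inputs=[{"query": "How does carbon pricing reduce emissions?"}],
        output={"sub_queries": "1) Carbon tax mechanisms 2) Cap-and-trade systems"},
    ),
]

# batch_evaluate returns a list of EvalOutput objects; score == -1 marks a parsing failure.
results = judge.batch_evaluate(eval_inputs)
for output in results:
    print(output.score)
```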