diff --git a/flow_judge/flow_judge.py b/flow_judge/flow_judge.py
index acb958e..90959ee 100644
--- a/flow_judge/flow_judge.py
+++ b/flow_judge/flow_judge.py
@@ -4,7 +4,6 @@
from flow_judge.eval_data_types import EvalInput, EvalOutput
from flow_judge.metrics import CustomMetric, Metric
from flow_judge.models.common import AsyncBaseFlowJudgeModel, BaseFlowJudgeModel
-from flow_judge.utils.prompt_formatter import format_rubric, format_user_prompt, format_vars
from flow_judge.utils.result_writer import write_results_to_disk
from flow_judge.utils.validators import validate_eval_input
@@ -19,7 +18,7 @@ def __init__(
self,
metric: Metric | CustomMetric,
model: BaseFlowJudgeModel | AsyncBaseFlowJudgeModel,
- output_dir: str | None = "output/",
+ output_dir: str = "output/",
):
"""Initialize BaseFlowJudge with a metric and model."""
if not isinstance(metric, (Metric, CustomMetric)):
@@ -30,13 +29,7 @@ def __init__(
def _format_prompt(self, eval_input: EvalInput) -> str:
"""Format the prompt for a single evaluation input."""
- prompt_variables = {
- "INPUTS": format_vars(eval_input.inputs),
- "OUTPUT": format_vars([eval_input.output]),
- "EVALUATION_CRITERIA": self.metric.criteria,
- "RUBRIC": format_rubric(self.metric.rubric),
- }
- return format_user_prompt(prompt_variables)
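+        # Prompt construction is delegated to the metric, which owns its template and rubric.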
+ return self.metric.format_prompt(eval_input.dict())
def _validate_inputs(self, eval_inputs: EvalInput | list[EvalInput]):
"""Validate required inputs and output against the metric."""
@@ -102,7 +95,6 @@ def batch_evaluate(
self._save_results(eval_inputs, eval_outputs)
if parse_failures > 0:
logger.warning(f"Number of parsing failures: {parse_failures} out of {len(responses)}")
-
return eval_outputs
@@ -152,8 +144,6 @@ async def async_batch_evaluate(
parse_failures = sum(1 for output in eval_outputs if output.score == -1)
if save_results:
await asyncio.to_thread(self._save_results, eval_inputs, eval_outputs)
-
if parse_failures > 0:
logger.warning(f"Number of parsing failures: {parse_failures} out of {len(responses)}")
-
return eval_outputs
diff --git a/flow_judge/metrics/README.md b/flow_judge/metrics/README.md
new file mode 100644
index 0000000..d5797d7
--- /dev/null
+++ b/flow_judge/metrics/README.md
@@ -0,0 +1,66 @@
+## Example Rubrics and Requesting New Ones
+
+We provide a collection of example rubrics in the `example_rubrics` directory. These rubrics are written in YAML format for easy customization and integration into your evaluation workflows.
+
+### Browsing Example Rubrics
+
+You can find example rubrics for various evaluation tasks in the `example_rubrics` directory. Each rubric is stored as a YAML file and includes the following information:
+
+- `name`: A unique identifier for the rubric
+- `description`: A brief description of what the rubric evaluates
+- `criteria`: The main evaluation criteria
+- `rubric`: A list of scoring options with descriptions
+- `required_inputs`: The inputs required for the evaluation
+- `required_output`: The output to be evaluated
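+
+For example, a minimal rubric following this structure might look like the sketch below (abridged from the shipped `response_correctness_binary` metric):
+
+```yaml
+name: response_correctness_binary
+description: Evaluates the correctness of a response in a binary manner
+criteria: Does the generated response accurately match the provided reference answer for the given query?
+rubric:
+  - score: 0
+    description: The response is incorrect or irrelevant to the query.
+  - score: 1
+    description: The response is correct and relevant to the query.
+required_inputs:
+  - query
+  - reference_answer
+required_output: response
+```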
+
+### Requesting New Rubrics
+
+To request a new rubric or modifications to existing ones, you can use our built-in tools:
+
+#### From CLI:
+
+```bash
+flow-judge request-rubric
+```
+
+This interactive command will guide you through the process of creating a new rubric request.
+
+#### From Jupyter Notebook:
+
+```python
+from flow_judge.notebook_utils import display_rubric_request_form
+
+display_rubric_request_form()
+```
+
+This will display an interactive form in your notebook for creating a new rubric request.
+
+#### Programmatically:
+
+```python
+from flow_judge.utils.notebook import request_rubric
+
+request_rubric(
+ title="Your Rubric Title",
+ description="Brief description of the rubric",
+ similar_to="existing_rubric_name", # Optional
+ custom_fields={"key": "value"} # Optional
+)
+```
+
+This will open a pre-filled GitHub issue in your browser, making it easy to submit your request. The issue will include:
+
+- A proposed structure for the new rubric
+- Reference to a similar existing rubric (if specified)
+- A list of all existing rubrics for context
+- Any additional custom fields you've provided
+
+By using these tools, you can easily contribute to the growth and improvement of the `flow-judge` library's evaluation capabilities.
+
+### Using a Rubric from YAML
+
+You can also load a rubric YAML file directly and create a judge from it:
+
+```python
+from flow_judge.rubric_loader import create_judge_from_yaml
+
+judge = create_judge_from_yaml('path/to/rubric.yaml', model_type='vllm')
+result = judge.evaluate(eval_input)
+```
diff --git a/flow_judge/metrics/_data/metrics/article_evaluation/clarity_binary.yaml b/flow_judge/metrics/_data/metrics/article_evaluation/clarity_binary.yaml
new file mode 100644
index 0000000..ae6c655
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/article_evaluation/clarity_binary.yaml
@@ -0,0 +1,12 @@
+name: article_clarity
+description: Evaluates the clarity and readability of an article
+criteria: Does the article's writing quality, in terms of clarity, conciseness, and ease of understanding, effectively communicate the information to the reader?
+rubric:
+ - score: 0
+ description: The article's writing quality is poor to moderate in terms of clarity, conciseness, and ease of understanding. It may have confusing sentence structures, inappropriate vocabulary, lack of organization, or instances of unnecessary verbosity. The writing does not effectively communicate the information to the reader, making it difficult to comprehend the content without significant effort.
+ - score: 1
+ description: The article's writing quality is high in terms of clarity, conciseness, and ease of understanding. It features well-constructed sentences, appropriate vocabulary, logical organization, and efficient conveyance of information. The writing effectively communicates the information to the reader, allowing for easy comprehension and a smooth reading experience.
+required_inputs:
+ - user_instructions
+ - context
+required_output: article
diff --git a/flow_judge/metrics/_data/metrics/article_evaluation/completeness_binary.yaml b/flow_judge/metrics/_data/metrics/article_evaluation/completeness_binary.yaml
new file mode 100644
index 0000000..5515011
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/article_evaluation/completeness_binary.yaml
@@ -0,0 +1,12 @@
+name: article_completeness
+description: Evaluates the completeness of an article based on provided instructions and context
+criteria: Evaluate the extent to which the article provides comprehensive coverage of all topics, key points, and information specified in the instructions, ensuring that no relevant aspects are omitted or inadequately addressed.
+rubric:
+ - score: 0
+ description: The article fails to provide comprehensive coverage of the required topics, key points, and information specified in the instructions. It omits crucial information and has significant gaps in addressing relevant aspects.
+ - score: 1
+ description: The article offers comprehensive coverage of all required topics, key points, and information specified in the instructions. It thoroughly addresses all relevant aspects, providing in-depth information and leaving no significant gaps in coverage.
+required_inputs:
+ - user_instructions
+ - context
+required_output: article
diff --git a/flow_judge/metrics/_data/metrics/article_evaluation/objectivity_binary.yaml b/flow_judge/metrics/_data/metrics/article_evaluation/objectivity_binary.yaml
new file mode 100644
index 0000000..fdfc596
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/article_evaluation/objectivity_binary.yaml
@@ -0,0 +1,12 @@
+name: article_objectivity
+description: Evaluates the objectivity and balance of an article
+criteria: Evaluate whether the article presents information in an unbiased manner by incorporating multiple perspectives fairly and avoiding partisan or one-sided reporting.
+rubric:
+ - score: 0
+ description: The article shows significant bias in its reporting. It either presents only one perspective or heavily favors a particular viewpoint. Alternative views are absent, minimized, or unfairly represented. The language used may be loaded or emotionally charged, and sources may be limited to those supporting a single perspective. The overall presentation lacks journalistic objectivity and balance.
+ - score: 1
+ description: The article demonstrates a commitment to unbiased reporting. It presents multiple perspectives on the topic, giving fair representation to different viewpoints. The language used is neutral and objective, avoiding loaded terms or emotional rhetoric. The article uses a diverse range of credible sources to support various perspectives. While minor imperfections may exist, the overall presentation maintains journalistic integrity, balance, and objectivity.
+required_inputs:
+ - user_instructions
+ - context
+required_output: article
diff --git a/flow_judge/metrics/_data/metrics/article_evaluation/source_attribution_binary.yaml b/flow_judge/metrics/_data/metrics/article_evaluation/source_attribution_binary.yaml
new file mode 100644
index 0000000..14dd09d
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/article_evaluation/source_attribution_binary.yaml
@@ -0,0 +1,12 @@
+name: article_source_attribution
+description: Evaluates the accuracy and comprehensiveness of source attribution in an article
+criteria: Does the article accurately and comprehensively attribute information to reliable sources, ensuring that these sources align with those provided in the information and instructions?
+rubric:
+ - score: 0
+ description: The article fails to accurately and comprehensively attribute information to reliable sources. There are significant gaps or inaccuracies in attribution, and many sources either do not align with those provided in the instructions or are unreliable. Attribution practices are inconsistent or inadequate, with key information often lacking proper sourcing.
+ - score: 1
+ description: The article accurately and comprehensively attributes information to reliable sources that align with those provided in the information and instructions. Attribution practices are consistently followed throughout the article, with all key information properly sourced and credited. The sourcing is appropriate and demonstrates excellent adherence to attribution standards.
+required_inputs:
+ - user_instructions
+ - context
+required_output: article
diff --git a/flow_judge/metrics/_data/metrics/query_decomposition/sub_query_coverage_3point.yaml b/flow_judge/metrics/_data/metrics/query_decomposition/sub_query_coverage_3point.yaml
new file mode 100644
index 0000000..56180d3
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/query_decomposition/sub_query_coverage_3point.yaml
@@ -0,0 +1,13 @@
+name: sub_query_coverage
+description: Evaluates the coverage and relevance of sub-queries generated from a main query
+criteria: Do the generated sub-queries provide sufficient breadth to cover all aspects of the main query?
+rubric:
+ - score: 1
+ description: The sub-queries lack breadth and fail to address multiple important aspects of the main query. They are either too narrow, focusing on only one or two dimensions of the question, or they diverge significantly from the main query's intent. Using these sub-queries alone would result in a severely limited exploration of the topic.
+ - score: 2
+ description: The sub-queries cover some aspects of the main query but lack comprehensive breadth. While they touch on several dimensions of the question, there are still noticeable gaps in coverage. Some important facets of the main query are either underrepresented or missing entirely. The sub-queries provide a partial, but incomplete, exploration of the topic.
+ - score: 3
+ description: The sub-queries demonstrate excellent breadth, effectively covering all major aspects of the main query. They break down the main question into a diverse set of dimensions, ensuring a comprehensive exploration of the topic. Each significant facet of the main query is represented in the sub-queries, allowing for a thorough and well-rounded investigation of the subject matter.
+required_inputs:
+ - query
+required_output: sub_queries
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/correctness_3point.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/correctness_3point.yaml
new file mode 100644
index 0000000..6f6f2eb
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/correctness_3point.yaml
@@ -0,0 +1,14 @@
+name: response_correctness_3point
+description: Evaluates the correctness of a response using a 3-point Likert scale
+criteria: Based on the provided reference response, how well does the system's generated response match the correct answer to the given query?
+rubric:
+ - score: 1
+ description: The generated response does not match the reference response at all. It either fails to address the query or provides a completely incorrect answer. The information presented is irrelevant, inaccurate, or entirely misses the point of the question. Using this response would lead to a fundamental misunderstanding of the topic.
+ - score: 2
+ description: The generated response partially matches the reference response. It addresses the query but may contain some incorrect, irrelevant or incomplete information compared to the reference. While some aspects of the answer are correct, there are noticeable gaps, inaccuracies, or misinterpretations that prevent it from being fully correct. The response demonstrates a partial understanding of the topic but falls short of a comprehensive and accurate answer.
+ - score: 3
+ description: The generated response fully matches the reference response. It accurately and completely answers the query, containing all the relevant information from the reference without any incorrect or extraneous details. The response demonstrates a thorough understanding of the topic and provides a comprehensive answer that aligns perfectly with the reference. Any variations in wording or structure do not detract from the accuracy or completeness of the information presented.
+required_inputs:
+ - query
+ - reference_answer
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/correctness_5point.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/correctness_5point.yaml
new file mode 100644
index 0000000..a49b098
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/correctness_5point.yaml
@@ -0,0 +1,18 @@
+name: response_correctness_5point
+description: Evaluates the correctness of a response using a 5-point Likert scale
+criteria: Compare the system's response to the provided reference answer and rate how well they match in accuracy and completeness to answer the query.
+rubric:
+ - score: 1
+ description: The response is completely incorrect or irrelevant to the query, with no overlap in information with the reference answer. It fails to address the question entirely or provides information that is entirely unrelated or contradictory to the correct answer. Using this response would lead to a complete misunderstanding of the topic.
+ - score: 2
+ description: The response contains some correct information relevant to the query but is substantially incomplete or inaccurate compared to the reference answer. While there may be elements of truth, the majority of the response is either incorrect, missing crucial information, or so poorly articulated that it fails to effectively answer the query. The inaccuracies or omissions significantly impair the usefulness of the response.
+ - score: 3
+ description: The response answers the query with reasonable accuracy but is missing key details or has minor inaccuracies compared to the reference. It demonstrates a basic understanding of the topic and provides some correct information, but falls short of a comprehensive answer. The response may lack depth, omit important nuances, or contain minor errors that, while not completely undermining the answer, detract from its overall quality and completeness.
+ - score: 4
+ description: The response accurately answers the query and is nearly complete, only leaving out non-essential details compared to the reference. It demonstrates a strong understanding of the topic and provides a thorough answer that covers all major points. Any omissions are minor and do not significantly impact the overall quality or usefulness of the response. The information presented is accurate and well-articulated, with only minor room for improvement in terms of completeness or detail.
+ - score: 5
+ description: The response perfectly matches the accuracy and level of detail of the reference answer, containing all key information to comprehensively answer the query. It demonstrates a complete and nuanced understanding of the topic, providing a response that is indistinguishable in quality and content from the reference answer. All relevant details, including subtle points or exceptions, are accurately presented. The response leaves no room for improvement in terms of accuracy, completeness, or relevance to the query.
+required_inputs:
+ - query
+ - reference_answer
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/correctness_binary.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/correctness_binary.yaml
new file mode 100644
index 0000000..1ae15c3
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/correctness_binary.yaml
@@ -0,0 +1,12 @@
+name: response_correctness_binary
+description: Evaluates the correctness of a response in a binary manner
+criteria: Does the generated response accurately match the provided reference answer for the given query?
+rubric:
+ - score: 0
+ description: The response is incorrect or irrelevant. It either contains inaccurate information, fails to address the query adequately, or provides information that does not align with the reference answer. The response may be off-topic, incomplete, or contain significant errors that render it unsuitable as an answer to the given query.
+ - score: 1
+ description: The response is correct and relevant. It accurately addresses the query and aligns closely with the provided reference answer. The information presented is factual, complete, and directly pertinent to the question asked. While the wording may not be identical to the reference answer, the core content and meaning are equivalent, demonstrating a correct understanding and articulation of the required information.
+required_inputs:
+ - query
+ - reference_answer
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_3point.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_3point.yaml
new file mode 100644
index 0000000..e387616
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_3point.yaml
@@ -0,0 +1,14 @@
+name: response_faithfulness_3point
+description: Evaluates the faithfulness of a response to the provided context using a 3-point Likert scale
+criteria: Based on the provided context, assess how faithful and consistent the response is to the information given. Check if the response contains any fabricated or hallucinated content that cannot be supported by the context.
+rubric:
+ - score: 1
+    description: The response contains a significant amount of fabricated information or unsupported claims that directly contradict or deviate from the given context. Major hallucinations are present that have no factual basis in the context provided. The response introduces substantial new information, makes claims that cannot be inferred from the context, or presents ideas that are inconsistent with the given material. The level of unfaithfulness severely compromises the reliability and accuracy of the response.
+ - score: 2
+ description: The response is mostly faithful to the context, but contains some minor unsupported details or slight factual inconsistencies. While the overall message is supported, there are a few deviations that are not directly inferable from the strict context alone. These may include minor embellishments, slight exaggerations, or small details that, while not entirely contradictory, go somewhat beyond what the context explicitly supports. The response remains largely reliable, but with some caveats.
+ - score: 3
+ description: The response is completely faithful and consistent with the context provided. All details and claims are directly supported by the information given, without any hallucinated or fabricated content present. The response accurately represents only the facts in the context, making no unsupported claims or inferences. Any conclusions or interpretations are strictly based on the provided information, demonstrating complete faithfulness to the context. The response can be fully trusted as an accurate representation of the given material.
+required_inputs:
+ - query
+ - context
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_5point.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_5point.yaml
new file mode 100644
index 0000000..8497340
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_5point.yaml
@@ -0,0 +1,18 @@
+name: response_faithfulness_5point
+description: Evaluates the faithfulness of a response to the provided context using a 5-point Likert scale
+criteria: Based on the given context, evaluate how consistent and faithful the generated response is to the context. The response should not contain any hallucinated or fabricated information that is not supported by the context.
+rubric:
+ - score: 1
+    description: The response is completely inconsistent with the provided context. It contains a significant amount of hallucinated or fabricated information that directly contradicts or is not supported at all by the context. The response introduces major claims, facts, or ideas that have no basis in the given information. It demonstrates a severe lack of faithfulness to the context, potentially leading to complete misunderstanding or misinformation.
+ - score: 2
+ description: The response is mostly inconsistent with the provided context. While it may contain some information from the context, it introduces a substantial amount of hallucinated or fabricated details that deviate from the context. There are significant discrepancies between the response and the given information, with multiple instances of unsupported claims or contradictions. The overall faithfulness is poor, though some elements of accuracy may be present.
+ - score: 3
+ description: The response is somewhat consistent with the provided context. It includes a mix of information from the context and some hallucinated or fabricated details. The fabrications are minor and do not significantly contradict the context, but they do extend beyond what can be directly inferred. The response maintains a basic level of faithfulness, but with noticeable deviations that require caution when interpreting the information.
+ - score: 4
+ description: The response is mostly consistent with the provided context. The vast majority of the content is supported by the context, with only minor and inconsequential inconsistencies or fabrications, if any. Any deviations from the context are subtle and do not materially affect the overall message or accuracy of the response. The response demonstrates a high degree of faithfulness, with only slight room for improvement.
+ - score: 5
+ description: The response is completely consistent with and faithful to the provided context. All details in the response are directly supported by the context, without any hallucinated or fabricated information. The response accurately represents the given information, making only claims and inferences that are fully justified by the context. It demonstrates perfect faithfulness, ensuring that all information presented can be traced back to the original context without any embellishment or deviation.
+required_inputs:
+ - query
+ - context
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_binary.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_binary.yaml
new file mode 100644
index 0000000..004929e
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/faithfulness_binary.yaml
@@ -0,0 +1,12 @@
+name: response_faithfulness_binary
+description: Evaluates the faithfulness of a response to the provided context in a binary manner
+criteria: Based on the provided context, does the response contain only information that is supported by or directly inferable from the context?
+rubric:
+ - score: 0
+ description: The response contains statements or claims that cannot be directly found in or logically inferred from the provided context. There is hallucinated or fabricated information present in the response that does not have support in the given context. This may include introducing new facts, making unsupported generalizations, or drawing conclusions that go beyond what the context allows. The response demonstrates a lack of faithfulness to the provided information, potentially misleading the reader or introducing inaccuracies.
+ - score: 1
+ description: The response contains only statements and claims that are directly stated in or logically inferable from the provided context. There is no hallucinated or fabricated information present in the response that cannot be traced back to or deduced from the context. The response demonstrates strict adherence to the given information, making only claims that are fully supported by the context. Any inferences or conclusions drawn are logical and well-grounded in the provided material, ensuring that the response remains faithful to the original context without introducing unsupported information.
+required_inputs:
+ - query
+ - context
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/relevance_3point.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/relevance_3point.yaml
new file mode 100644
index 0000000..d6a6c5a
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/relevance_3point.yaml
@@ -0,0 +1,14 @@
+name: response_relevance_3point
+description: Evaluates the relevance of a response to the given query using a 3-point Likert scale
+criteria: How relevant and pertinent is the response to addressing the given query, without including extraneous or irrelevant information?
+rubric:
+ - score: 1
+ description: The response is not relevant to the query at all. It either does not address the key points of the query or includes only irrelevant or extraneous information that does not pertain to answering the query directly. The response may be entirely off-topic, focus on unrelated aspects, or provide information that has no bearing on the question asked. It fails to provide any useful or pertinent information in relation to the query, rendering it ineffective as an answer.
+ - score: 2
+ description: The response addresses some aspects of the query but is only partially relevant. It may go off-topic or include some tangentially related or extraneous information. Key points needed to comprehensively address the query are missing. While the response contains some relevant information, it is mixed with irrelevant details or fails to fully capture the essence of the query. The partial relevance limits the effectiveness of the response in fully answering the question at hand.
+ - score: 3
+ description: The response is highly relevant to the query and directly addresses all the key points needed to comprehensively answer the query. No irrelevant or extraneous information is included. The response is fully pertinent to the query, focusing precisely on the aspects required to provide a complete and accurate answer. It demonstrates a clear understanding of the query's requirements and delivers information that is both relevant and comprehensive, without any unnecessary digressions or omissions.
+required_inputs:
+ - query
+ - context
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/relevance_5point.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/relevance_5point.yaml
new file mode 100644
index 0000000..778e803
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/relevance_5point.yaml
@@ -0,0 +1,18 @@
+name: response_relevance_5point
+description: Evaluates the relevance of a response to the given query using a 5-point Likert scale
+criteria: How well does the response address the query, providing relevant information without including anything extraneous or irrelevant?
+rubric:
+ - score: 1
+ description: The response is completely irrelevant to the query, does not address it at all, or contains only extraneous information unrelated to the query. It fails to provide any information that answers the question or addresses the topic at hand. The response may be entirely off-topic, discuss unrelated subjects, or provide information that has no connection to the query. It offers no value in terms of answering the original question.
+ - score: 2
+ description: The response is mostly irrelevant to the query, addressing it only tangentially or containing significant amounts of unrelated or extraneous information. While there may be small elements that relate to the query, the majority of the content does not contribute to answering the question. The response largely misses the point of the query, focusing on peripheral or unimportant aspects while neglecting the core issues that need to be addressed.
+ - score: 3
+ description: The response is somewhat relevant to the query, addressing the main point but going off-topic or including some extraneous details. Key aspects of the query may not be addressed. The response demonstrates a basic understanding of the query but lacks focus or precision. It may provide some useful information mixed with irrelevant details, or it might address only part of the query while ignoring other important elements. The overall relevance is moderate, with clear room for improvement.
+ - score: 4
+ description: The response is largely relevant to the query, addressing the key points without much extraneous information. It may not cover all aspects of the query exhaustively, but it provides a substantive and pertinent answer to the main issues raised. The majority of the information presented is directly related to the query, with only minor digressions or slightly extraneous details. The response demonstrates a good understanding of the query's requirements and delivers mostly relevant content.
+ - score: 5
+ description: The response is highly relevant to the query, addressing all key aspects directly and thoroughly without any irrelevant or extraneous information. It provides a comprehensive and focused answer that aligns perfectly with the query's requirements. Every part of the response contributes directly to addressing the question at hand, demonstrating a complete understanding of the query and delivering information that is entirely pertinent. The response is concise yet complete, offering maximum relevance to the given query.
+required_inputs:
+ - query
+ - context
+required_output: response
diff --git a/flow_judge/metrics/_data/metrics/response_evaluation/relevance_binary.yaml b/flow_judge/metrics/_data/metrics/response_evaluation/relevance_binary.yaml
new file mode 100644
index 0000000..2649ee0
--- /dev/null
+++ b/flow_judge/metrics/_data/metrics/response_evaluation/relevance_binary.yaml
@@ -0,0 +1,12 @@
+name: response_relevance_binary
+description: Evaluates the relevance of a response to the given query in a binary manner
+criteria: Is the response directly relevant to answering the query considering the context, without including irrelevant or extraneous information?
+rubric:
+ - score: 0
+ description: The response does not sufficiently address the query, either by failing to directly answer the question asked, going off-topic, or including irrelevant or extraneous information that was not requested in the original query. The response may contain accurate information, but it does not align with the specific requirements of the query. It might focus on tangential aspects, provide unnecessary background, or delve into unrelated topics. As a result, the response fails to provide a clear, focused answer to the question at hand.
+ - score: 1
+ description: The response directly and sufficiently addresses the query. All of the content is relevant to answering the question asked, without going off-topic or providing unnecessary additional information beyond what the query requires. The response maintains a clear focus on the specific aspects highlighted in the query, providing a concise and pertinent answer. It avoids extraneous details or tangential information, ensuring that every part of the response contributes directly to addressing the question at hand.
+required_inputs:
+ - query
+ - context
+required_output: response
diff --git a/flow_judge/metrics/_data/prompts/outputs_only.j2 b/flow_judge/metrics/_data/prompts/outputs_only.j2
new file mode 100644
index 0000000..b5d21d0
--- /dev/null
+++ b/flow_judge/metrics/_data/prompts/outputs_only.j2
@@ -0,0 +1,34 @@
+# GOAL
+Your job is to evaluate a task carried out by an AI system powered by a large language model.
+
+You will be provided the output of the task, as well as the evaluation criteria and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation criteria and scoring rubric provided.
+
+# OUTPUT
+Below is the output of the task:
+<output>
+{{OUTPUT}}
+</output>
+
+# EVALUATION CRITERIA AND SCORING RUBRIC
+Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
+<evaluation_criteria>
+{{EVALUATION_CRITERIA}}
+</evaluation_criteria>
+
+<scoring_rubric>
+{{RUBRIC}}
+</scoring_rubric>
+
+# INSTRUCTIONS FOR THE EVALUATION
+1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. Review the evaluation criteria and scoring rubric to understand the different levels of performance and the descriptions for each score.
+2. Review the output: Examine the output generated from completing the task.
+3. Compare output to score descriptions: Compare the output against the criteria and score descriptions in the scoring rubric. For each criterion, decide which description best matches the output.
+4. After comparing the output to the score descriptions, pay attention to the small details that might impact the final score that you assign. Sometimes a small difference can dictate the final score.
+5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring to specific aspects of the output and comparing them to the rubric.
+6. Assign a final score based on the scoring rubric.
+
+## FORMAT FOR THE EVALUATION
+- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
+- Write the numeric score inside <score> tags, without any additional surrounding text and always after the feedback.
+
+Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric.
diff --git a/flow_judge/metrics/_data/prompts/standard.j2 b/flow_judge/metrics/_data/prompts/standard.j2
new file mode 100644
index 0000000..033140b
--- /dev/null
+++ b/flow_judge/metrics/_data/prompts/standard.j2
@@ -0,0 +1,40 @@
+# GOAL
+Your job is to evaluate a task carried out by an AI system powered by a large language model.
+
+You will be provided with the inputs and output of the task, as well as the evaluation criteria and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation criteria and scoring rubric provided.
+
+# INPUT
+Below are the inputs required for performing the task:
+<inputs>
+{{INPUTS}}
+</inputs>
+
+# OUTPUT
+Below is the output of the task:
+<output>
+{{OUTPUT}}
+</output>
+
+# EVALUATION CRITERIA AND SCORING RUBRIC
+Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
+<evaluation_criteria>
+{{EVALUATION_CRITERIA}}
+</evaluation_criteria>
+
+<scoring_rubric>
+{{RUBRIC}}
+</scoring_rubric>
+
+# INSTRUCTIONS FOR THE EVALUATION
+1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. Review the evaluation criteria and scoring rubric to understand the different levels of performance and the descriptions for each score.
+2. Review the inputs and output: Look at the inputs provided for the task. Examine the output generated from completing the task.
+3. Compare output to score descriptions: Compare the output against the criteria and score descriptions in the scoring rubric. For each criterion, decide which description best matches the output.
+4. After comparing the output to the score descriptions, pay attention to the small details that might impact the final score that you assign. Sometimes a small difference can dictate the final score.
+5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring to specific aspects of the output and comparing them to the rubric.
+6. Assign a final score based on the scoring rubric.
+
+## FORMAT FOR THE EVALUATION
+- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
+- Write the numeric score inside <score> tags, without any additional surrounding text and always after the feedback.
+
+Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric.
diff --git a/flow_judge/metrics/metric.py b/flow_judge/metrics/metric.py
index dabe699..a8cf034 100644
--- a/flow_judge/metrics/metric.py
+++ b/flow_judge/metrics/metric.py
@@ -1,52 +1,260 @@
-from pydantic import BaseModel
+import importlib.resources as pkg_resources
+import warnings
+from pathlib import Path
+from typing import Any
+
+import yaml
+from jinja2 import Environment, FileSystemLoader, Template, TemplateError
+from pydantic import BaseModel, field_validator
+
+from flow_judge import metrics as metrics_module  # Used to locate packaged metric and prompt data
class RubricItem(BaseModel):
- """Represents an item in the evaluation rubric."""
+ """Represents an item in the evaluation rubric.
+
+ :param score: The score associated with this rubric item.
+ :type score: int
+ :param description: A detailed description of the criteria for this score.
+ :type description: str
+
+ :raises ValueError: If the score is negative.
+ :warns UserWarning: If the score is above 5, as Flow Judge v0.1 is
+ trained on 0-5 integer values.
+ """
score: int
description: str
+ @field_validator("score")
+ @classmethod
+ def validate_score(cls, v: int):
+ """Validate the score of a rubric item.
+
+ :param v: The score value to validate.
+ :type v: int
+ :return: The validated score.
+ :rtype: int
+ :raises ValueError: If the score is negative.
+ :warns UserWarning: If the score is above 5.
+ """
+ if v < 0:
+ raise ValueError("Score must be non-negative")
+ if v > 5:
+ warnings.warn(
+ "Flow Judge v0.1 has been trained with 0-5 integer values."
+ " Scores above 5 may lead to unexpected behavior.",
+ UserWarning,
+ stacklevel=2,
+ )
+ return v
+
class Metric(BaseModel):
- """Represents an evaluation metric."""
+ """Represents an evaluation metric.
+
+ :param name: The name of the metric.
+ :type name: str
+ :param description: A detailed description of the metric.
+ :type description: str
+ :param criteria: The evaluation criteria for this metric.
+ :type criteria: str
+ :param rubric: A list of RubricItems defining the scoring rubric.
+ :type rubric: List[RubricItem]
+ :param required_inputs: A list of required input fields for this metric.
+ :type required_inputs: List[str]
+ :param required_output: The name of the required output field.
+ :type required_output: str
+
+ :raises ValueError: If the rubric contains duplicate scores or if
+ scores are not in ascending order.
+ """
name: str
+ description: str
criteria: str
rubric: list[RubricItem]
- required_inputs: list[str] | None = None
+ required_inputs: list[str]
required_output: str
- def print_required_keys(self):
- """Prints the required input and output keys."""
- print(f"Metric: {self.name}")
- print("Required inputs:", ", ".join(self.required_inputs or []))
- print("Required output:", self.required_output)
+ @field_validator("rubric")
+ @classmethod
+    def check_rubric_scores(cls, rubric: list[RubricItem]) -> list[RubricItem]:
+        """Validate the rubric scores.
+
+        :param rubric: The list of rubric items to validate.
+        :type rubric: list[RubricItem]
+        :return: The validated rubric.
+        :rtype: list[RubricItem]
+        :raises ValueError: If the rubric contains duplicate scores or if
+            scores are not in ascending order.
+        """
+        scores = [item.score for item in rubric]
+        if len(scores) != len(set(scores)):
+            raise ValueError("Rubric contains duplicate scores")
+        if scores != sorted(scores):
+            raise ValueError("Rubric scores are not in ascending order")
+        return rubric
-class CustomMetric(Metric):
- """Represents a custom evaluation metric."""
-
- def __init__(
- self,
- name: str,
- criteria: str,
- rubric: list[RubricItem],
- required_inputs: list[str],
- required_output: str,
- ):
- """Initialize a custom metric."""
- super().__init__(
- name=name,
- criteria=criteria,
- rubric=rubric,
- required_inputs=required_inputs,
- required_output=required_output,
+ @classmethod
+ def from_yaml(cls, file_path: str) -> "Metric":
+ """Load a metric from a YAML file.
+
+ :param file_path: The path to the YAML file.
+ :type file_path: str
+ :return: A Metric instance.
+ :rtype: Metric
+ :raises ValueError: If the YAML is invalid or if there's an error
+ loading the metric.
+ :raises FileNotFoundError: If the specified file is not found.
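+
+        Example (the path shown is illustrative)::
+
+            metric = Metric.from_yaml(
+                "flow_judge/metrics/_data/metrics/response_evaluation/correctness_binary.yaml"
+            )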
+ """
+ try:
+ with open(file_path) as file:
+ data = yaml.safe_load(file)
+ return cls(**data)
+ except yaml.YAMLError as e:
+ raise ValueError(f"Invalid YAML in {file_path}: {str(e)}") from e
+ except FileNotFoundError as e:
+ raise FileNotFoundError(f"Metric file not found: {file_path}") from e
+ except Exception as e:
+ raise ValueError(f"Error loading metric from {file_path}: {str(e)}") from e
+
+ @classmethod
+ def load_all_from_directory(cls, directory: str = "_data/metrics") -> dict[str, "Metric"]:
+ """Load all metric templates from a directory.
+
+ :param directory: The directory path containing metric YAML files.
+ :type directory: str
+ :return: A dictionary of metric names to Metric instances.
+ :rtype: Dict[str, Metric]
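+
+        Example (keys mirror the YAML file paths, here assuming the packaged data)::
+
+            metrics = Metric.load_all_from_directory()
+            correctness = metrics["response_evaluation/correctness_binary"]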
+ """
+ metrics_dir = Path(directory)
+ if not metrics_dir.is_dir():
+ # If the directory doesn't exist, use the package data
+ metrics_dir = pkg_resources.files(metrics_module) / "_data" / "metrics"
+
+ metrics = {}
+ for file in metrics_dir.glob("**/*.yaml"):
+ try:
+ relative_path = file.relative_to(metrics_dir)
+ metric_name = relative_path.with_suffix("").as_posix()
+ metrics[metric_name] = cls.from_yaml(file)
+ except Exception as e:
+ print(f"Error loading metric from {file}: {str(e)}")
+ return metrics
+
+ @staticmethod
+ def load_prompt_template(template_name: str) -> str:
+ """Load a Jinja template from the prompts directory.
+
+ :param template_name: The name of the template file.
+ :type template_name: str
+        :return: The raw template source as a string.
+ :rtype: str
+ :raises ValueError: If there's an error in the template.
+ """
+ template_dir = Path("_data/prompts")
+ if not template_dir.is_dir():
+ # If the directory doesn't exist, use the package data
+ template_dir = pkg_resources.files(metrics_module) / "_data" / "prompts"
+
+ env = Environment(loader=FileSystemLoader(template_dir))
+        try:
+            # Return the raw template source so format_prompt can render it with
+            # the evaluation variables; rendering here would blank the placeholders.
+            source, _, _ = env.loader.get_source(env, template_name)
+            return source
+ except TemplateError as e:
+ raise ValueError(f"Error in template {template_name}: {str(e)}") from e
+
+ def _format_inputs(self, eval_input: dict[str, Any]) -> str:
+ """Format all inputs with XML tags and concatenate them.
+
+ :param eval_input: A dictionary of input values.
+ :type eval_input: Dict[str, Any]
+ :return: A string of formatted inputs.
+ :rtype: str
+ """
+ formatted_inputs = []
+ for key, value in eval_input.items():
+ if key != self.required_output:
+                formatted_inputs.append(f"<{key}>\n{str(value)}\n</{key}>")
+ return "\n".join(formatted_inputs)
+
+ def validate_inputs(self, eval_input: dict[str, Any]) -> None:
+ """Validate that all required inputs are present in the eval_input.
+
+ :param eval_input: A dictionary of input values.
+ :type eval_input: Dict[str, Any]
+ :raises ValueError: If any required inputs are missing.
+ """
+ missing_inputs = [
+ input_name for input_name in self.required_inputs if input_name not in eval_input
+ ]
+ if missing_inputs:
+ raise ValueError(f"Missing required inputs: {', '.join(missing_inputs)}")
+
+ def format_rubric(self) -> str:
+ """Format the rubric for the prompt.
+
+ :return: A string representation of the rubric.
+ :rtype: str
+ """
+ return "\n".join([f"- Score {item.score}: {item.description}" for item in self.rubric])
+
+ def format_prompt(self, eval_input: dict[str, Any]) -> str:
+ """Format the prompt for evaluation based on this metric.
+
+ :param eval_input: A dictionary of input values.
+ :type eval_input: Dict[str, Any]
+ :return: The formatted prompt string.
+ :rtype: str
+ :raises ValueError: If there's an error rendering the prompt or if
+ the prompt exceeds the maximum length.
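+
+        Example (input values are illustrative)::
+
+            metric = Metric.from_yaml("path/to/correctness_binary.yaml")
+            prompt = metric.format_prompt(
+                {"query": "What is 2 + 2?", "reference_answer": "4", "response": "4"}
+            )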
+ """
+ self.validate_inputs(eval_input)
+
+ formatted_inputs = self._format_inputs(eval_input)
+ formatted_output = (
+ f"<{self.required_output}>\n"
+ + f"{eval_input.get(self.required_output, '[No output provided]')}"
+            + f"\n</{self.required_output}>"
)
+ prompt_variables = {
+ "INPUTS": formatted_inputs,
+ "OUTPUT": formatted_output,
+ "EVALUATION_CRITERIA": self.criteria,
+ "RUBRIC": self.format_rubric(),
+ }
+
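+        # Fall back to the outputs-only template when the metric has no inputs
+        # other than the evaluated output itself.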
+ template_name = "standard.j2" if formatted_inputs else "outputs_only.j2"
+ try:
+ template = Template(self.load_prompt_template(template_name))
+ rendered_prompt = template.render(**prompt_variables)
+ if len(rendered_prompt) > 100000: # Arbitrary limit, adjust as needed
+ raise ValueError("Rendered prompt exceeds maximum length")
+ return rendered_prompt
+ except Exception as e:
+ raise ValueError(f"Error rendering prompt: {str(e)}") from e
+
+
+class CustomMetric(Metric):
+ """Represents a custom evaluation metric.
+
+ This class inherits all functionality from the Metric class and can be
+ extended to implement custom behavior for specific evaluation needs.
+ """
+
+ pass
+
+
-def list_all_metrics():
-    """List all metric variable names."""
+def list_all_metrics() -> list[str]:
+    """List all metric variable names.
+
+    :return: A list of metric names defined as global variables.
+    :rtype: List[str]
+    """
return [
name for name, value in globals().items() if isinstance(value, Metric) and name.isupper()
]
diff --git a/flow_judge/metrics/presets.py b/flow_judge/metrics/presets.py
deleted file mode 100644
index cb331b0..0000000
--- a/flow_judge/metrics/presets.py
+++ /dev/null
@@ -1,300 +0,0 @@
-from .metric import Metric, RubricItem
-
-# Pre-defined metrics
-RESPONSE_CORRECTNESS_BINARY = Metric(
- name="Response Correctness (Binary)",
- criteria="""Does the generated response accurately match the provided reference answer \
-for the given query?""",
- rubric=[
- RubricItem(
- score=0,
- description="""\
-The generated response does not match the reference answer. It either contains inaccurate \
-information, is missing key details from the reference, includes extra information not in the \
-reference, or fails to convey the same meaning as the reference answer.""",
- ),
- RubricItem(
- score=1,
- description="""\
-The generated response matches the reference answer exactly or contains all the key information \
-from the reference with no inaccuracies, extra details, or missing details. The meaning conveyed \
-by the generated response is equivalent to the reference.""",
- ),
- ],
- required_inputs=["query", "reference_answer"],
- required_output="response",
-)
-
-RESPONSE_CORRECTNESS_3POINT = Metric(
- name="Response Correctness (3-point Likert)",
- criteria="""\
-Based on the provided reference response, how well does the system's generated response match the \
-correct answer to the given query?""",
- rubric=[
- RubricItem(
- score=1,
- description="""\
-The generated response does not match the reference response at all. It either fails to address \
-the query or provides a completely incorrect answer.""",
- ),
- RubricItem(
- score=2,
- description="""\
-The generated response partially matches the reference response. It addresses the query but may \
-contain some incorrect, irrelevant or incomplete information compared to the reference.""",
- ),
- RubricItem(
- score=3,
- description="""\
-The generated response fully matches the reference response. It accurately and completely answers \
-the query, containing all the relevant information from the reference without any incorrect or \
-extraneous details.""",
- ),
- ],
- required_inputs=["query", "reference_answer"],
- required_output="response",
-)
-
-RESPONSE_CORRECTNESS_5POINT = Metric(
- name="Response Correctness (5-point Likert)",
- criteria="""\
-Compare the system's response to the provided reference answer and rate how well they match in \
-accuracy and completeness to answer the query.""",
- rubric=[
- RubricItem(
- score=1,
- description="""\
-The response is completely incorrect or irrelevant to the query, with no overlap in information \
-with the reference answer.""",
- ),
- RubricItem(
- score=2,
- description="""\
-The response contains some correct information relevant to the query but is substantially \
-incomplete or inaccurate compared to the reference answer.""",
- ),
- RubricItem(
- score=3,
- description="""\
-The response answers the query with reasonable accuracy but is missing key details or has minor \
-inaccuracies compared to the reference.""",
- ),
- RubricItem(
- score=4,
- description="""\
-The response accurately answers the query and is nearly complete, only leaving out non-essential \
-details compared to the reference.""",
- ),
- RubricItem(
- score=5,
- description="""\
-The response perfectly matches the accuracy and level of detail of the reference answer, \
-containing all key information to comprehensively answer the query.""",
- ),
- ],
- required_inputs=["query", "reference_answer"],
- required_output="response",
-)
-
-RESPONSE_FAITHFULNESS_BINARY = Metric(
- name="Response Faithfulness (Binary)",
- criteria="""\
-Based on the provided context, does the response contain only information that is supported by or \
-directly inferable from the context?""",
- rubric=[
- RubricItem(
- score=0,
- description="""\
-The response contains statements or claims that cannot be directly found in or logically inferred \
-from the provided context. There is hallucinated or fabricated information present in the response \
-that does not have support in the given context.""",
- ),
- RubricItem(
- score=1,
- description="""\
-The response contains only statements and claims that are directly stated in or logically \
-inferable from the provided context. There is no hallucinated or fabricated information present in \
-the response that cannot be traced back to or deduced from the context.""",
- ),
- ],
- required_inputs=["query", "context"],
- required_output="response",
-)
-
-RESPONSE_FAITHFULNESS_3POINT = Metric(
- name="Response Faithfulness (3-point Likert)",
- criteria="""\
-Based on the provided context, assess how faithful and consistent the response is to the \
-information given. Check if the response contains any fabricated or hallucinated content that \
-cannot be supported by the context.""",
- rubric=[
- RubricItem(
- score=1,
- description="""\
-The response contains significant amount of fabricated information or unsupported claims that \
-directly contradict or deviate from the given context. Major hallucinations are present that are \
-not factual based on the context provided.""",
- ),
- RubricItem(
- score=2,
- description="""\
-The response is mostly faithful to the context, but contains some minor unsupported details or \
-slight factual inconsistencies. While the overall message is supported, there are a few deviations \
-that are not directly inferable from the strict context alone.""",
- ),
- RubricItem(
- score=3,
- description="""\
-The response is completely faithful and consistent with the context provided. All details and \
-claims are directly supported by the information given, without any hallucinated or fabricated \
-content present. The response accurately represents only the facts in the context.""",
- ),
- ],
- required_inputs=["query", "context"],
- required_output="response",
-)
-
-RESPONSE_FAITHFULNESS_5POINT = Metric(
- name="Response Faithfulness (5-point Likert)",
- criteria="""\
-Based on the given context, evaluate how consistent and faithful the generated response is to the \
-context. The response should not contain any hallucinated or fabricated information that is not \
-supported by the context.""",
- rubric=[
- RubricItem(
- score=1,
- description="""\
-The response is completely inconsistent with the provided context. It contains significant amount \
-of hallucinated or fabricated information that directly contradicts or is not supported at all by \
-the context.""",
- ),
- RubricItem(
- score=2,
- description="""\
-The response is mostly inconsistent with the provided context. While it may contain some \
-information from the context, it introduces a substantial amount of hallucinated or fabricated \
-details that deviate from the context.""",
- ),
- RubricItem(
- score=3,
- description="""\
-The response is somewhat consistent with the provided context. It includes a mix of information \
-from the context and some hallucinated or fabricated details. The fabrications are minor and do \
-not significantly contradict the context.""",
- ),
- RubricItem(
- score=4,
- description="""\
-The response is mostly consistent with the provided context. The vast majority of the content is \
-supported by the context, with only minor and inconsequential inconsistencies or fabrications, if \
-any.""",
- ),
- RubricItem(
- score=5,
- description="""\
-The response is completely consistent with and faithful to the provided context. All details in \
-the response are directly supported by the context, without any hallucinated or fabricated \
-information.""",
- ),
- ],
- required_inputs=["query", "context"],
- required_output="response",
-)
-
-RESPONSE_RELEVANCE_BINARY = Metric(
- name="Response Relevance (Binary)",
- criteria="""\
-Is the response directly relevant to answering the query considering the context, without \
-including irrelevant or extraneous information?""",
- rubric=[
- RubricItem(
- score=0,
- description="""\
-The response does not sufficiently address the query, either by failing to directly answer the \
-question asked, going off-topic, or including irrelevant or extraneous information that was not \
-requested in the original query.""",
- ),
- RubricItem(
- score=1,
- description="""\
-The response directly and sufficiently addresses the query. All of the content is relevant to \
-answering the question asked, without going off-topic or providing unnecessary additional \
-information beyond what the query requires.""",
- ),
- ],
- required_inputs=["query", "context"],
- required_output="response",
-)
-
-RESPONSE_RELEVANCE_3POINT = Metric(
- name="Response Relevance (3-point Likert)",
- criteria="""\
-How relevant and pertinent is the response to addressing the given query, without including \
-extraneous or irrelevant information?""",
- rubric=[
- RubricItem(
- score=1,
- description="""\
-The response is not relevant to the query at all. It either does not address the key points of the \
-query or includes only irrelevant or extraneous information that does not pertain to answering the \
-query directly.""",
- ),
- RubricItem(
- score=2,
- description="""\
-The response addresses some aspects of the query but is only partially relevant. It may go \
-off-topic or include some tangentially related or extraneous information. Key points needed to \
-comprehensively address the query are missing.""",
- ),
- RubricItem(
- score=3,
- description="""\
-The response is highly relevant to the query and directly addresses all the key points needed to \
-comprehensively answer the query. No irrelevant or extraneous information is included. The \
-response is fully pertinent to the query.""",
- ),
- ],
- required_inputs=["query", "context"],
- required_output="response",
-)
-
-RESPONSE_RELEVANCE_5POINT = Metric(
- name="Response Relevance (5-point Likert)",
- criteria="""\
-How well does the response address the query, providing relevant information without including \
-anything extraneous or irrelevant?""",
- rubric=[
- RubricItem(
- score=1,
- description="""\
-The response is completely irrelevant to the query, does not address it at all, or contains only \
-extraneous information unrelated to the query.""",
- ),
- RubricItem(
- score=2,
- description="""\
-The response is mostly irrelevant to the query, addressing it only tangentially or containing \
-significant amounts of unrelated or extraneous information.""",
- ),
- RubricItem(
- score=3,
- description="""\
-The response is somewhat relevant to the query, addressing the main point but going off-topic or \
-including some extraneous details. Key aspects of the query may not be addressed.""",
- ),
- RubricItem(
- score=4,
- description="""\
-The response is largely relevant to the query, addressing the key points without much extraneous \
-information. It may not cover all aspects of the query exhaustively.""",
- ),
- RubricItem(
- score=5,
- description="""\
-The response is highly relevant to the query, addressing all key aspects directly and thoroughly \
-without any irrelevant or extraneous information.""",
- ),
- ],
- required_inputs=["query", "context"],
- required_output="response",
-)
diff --git a/flow_judge/models/vllm.py b/flow_judge/models/vllm.py
index a984794..bd57b51 100644
--- a/flow_judge/models/vllm.py
+++ b/flow_judge/models/vllm.py
@@ -199,7 +199,10 @@ def __init__(
**kwargs,
}
- os.environ["HF_HOME"] = download_dir
+ if download_dir:
+ os.environ["HF_HOME"] = download_dir
+ engine_args["download_dir"] = download_dir
+
if exec_async:
engine_args["disable_log_requests"] = kwargs.get("disable_log_requests", False)
self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
diff --git a/flow_judge/utils/cli.py b/flow_judge/utils/cli.py
new file mode 100644
index 0000000..7cc792d
--- /dev/null
+++ b/flow_judge/utils/cli.py
@@ -0,0 +1,27 @@
+import click
+
+from .notebook import load_rubric_templates, request_rubric
+
+
+@click.group()
+def cli():
+ """Command line interface for Flow Judge."""
+ pass
+
+
+@cli.command(name="request-rubric")
+@click.option("--title", prompt="Enter the title for the rubric request")
+@click.option("--description", prompt="Enter a description for the rubric")
+@click.option(
+ "--similar-to",
+ type=click.Choice(load_rubric_templates("rubrics").keys()),
+ help="Specify a similar existing rubric",
+)
+def request_rubric_command(title, description, similar_to):
+ """Create a new rubric request on GitHub."""
+ request_rubric(title, description, similar_to)
+ click.echo("Rubric request created successfully.")
+
+
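+# Example invocation (assumes a rubric named "article_evaluation/source_attribution"
+# exists in the local rubrics directory):
+#   flow-judge request-rubric --title "My Rubric" --description "What it evaluates" \
+#     --similar-to article_evaluation/source_attribution
+
+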
+if __name__ == "__main__":
+ cli()
diff --git a/flow_judge/utils/notebook.py b/flow_judge/utils/notebook.py
new file mode 100644
index 0000000..56f1439
--- /dev/null
+++ b/flow_judge/utils/notebook.py
@@ -0,0 +1,141 @@
+from IPython.display import HTML, display
+
+
+def is_notebook() -> bool:
+ """Check if the current environment is a Jupyter notebook.
+
+ Returns:
+ bool: True if in a notebook, False otherwise.
+ """
+ try:
+ return get_ipython().__class__.__name__ == "ZMQInteractiveShell"
+ except NameError:
+ return False
+
+
+def request_rubric(
+ title: str,
+ description: str,
+ similar_to: str | None = None,
+ custom_fields: dict[str, str] | None = None,
+):
+ """Create a new rubric request and open it in the browser or display it in a notebook.
+
+ Args:
+ title (str): Title of the rubric request.
+ description (str): Description of the rubric request.
+ similar_to (str | None): Name of a similar existing rubric.
+ custom_fields (dict[str, str] | None): Additional custom fields for the request.
+ """
+ pass
+
+
+# issue_title = f"Rubric Request: {title}"
+# templates = load_rubric_templates("example_rubrics")
+# similar_template = templates.get(similar_to, None) if similar_to else None
+
+# issue_body = f"""
+# ## Rubric Request
+
+# **Title:** {title}
+
+# **Description:**
+# {description}
+
+# ## Similar Rubric
+# {f"This request is similar to the existing rubric: `{similar_to}`" if similar_to else "N/A"}
+
+# ## Proposed Structure
+# ```yaml
+# name: {title.lower().replace(' ', '_')}
+# description: {description}
+# criteria: [TO BE FILLED]
+# rubric:
+# - score: 0
+# description: [TO BE FILLED]
+# - score: 1
+# description: [TO BE FILLED]
+# required_inputs: {similar_template.required_inputs if similar_template else '[TO BE FILLED]'}
+# required_output: {similar_template.required_output if similar_template else '[TO BE FILLED]'}
+# ```
+
+# ## Additional Information
+# {yaml.dump(custom_fields) if custom_fields else "Please provide any additional context"
+# " or requirements for this rubric."}
+
+# ## Existing Rubrics for Reference
+# {yaml.dump({name: template.description for name, template in templates.items()})}
+# """
+
+# encoded_body = quote(issue_body)
+# url = f"https://github.com/flowaicom/flow-judge/issues/new?title={quote(issue_title)}&body={encoded_body}&labels=enhancement,rubric-request"
+
+# if is_notebook():
+# display(
+# HTML(
+# f"""
+# <p>Click the link below to open the pre-filled rubric request on GitHub:</p>
+# <a href="{url}" target="_blank">Create rubric request</a>
+# """
+# )
+# )
+# else:
+# webbrowser.open(url)
+# print("Browser opened with the rubric request creation page.")
+
+
+def display_rubric_request_form():
+ """Display an interactive form for creating rubric requests in a Jupyter notebook."""
+ # templates = load_rubric_templates("example_rubrics")
+ # options = "".join([f'' for name in templates.keys()])
+ options = "placeholder"
+
+ form_html = f"""
+
+
+
+ """
+ display(HTML(form_html))
diff --git a/flow_judge/utils/prompt_formatter.py b/flow_judge/utils/prompt_formatter.py
deleted file mode 100644
index 4ffdc21..0000000
--- a/flow_judge/utils/prompt_formatter.py
+++ /dev/null
@@ -1,132 +0,0 @@
-from typing import Any
-
-from flow_judge.metrics.metric import RubricItem
-
-USER_PROMPT_TEMPLATE = """# GOAL
-Your job is to evaluate a task carried out by an AI system powered by a large \
-language model.
-
-You will be provided with the inputs and output of the task, as well as the evaluation criteria \
-and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
-criteria and scoring rubric provided.
-
-# INPUT
-Below are the inputs required for performing the task:
-<inputs>
-{INPUTS}
-</inputs>
-
-# OUTPUT
-Below is the output of the task:
-<output>
-{OUTPUT}
-</output>
-
-# EVALUATION CRITERIA AND SCORING RUBRIC
-Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
-<evaluation_criteria>
-{EVALUATION_CRITERIA}
-</evaluation_criteria>
-
-<scoring_rubric>
-{RUBRIC}
-</scoring_rubric>
-
-# INSTRUCTIONS FOR THE EVALUATION
-1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
-Review the evaluation criteria and scoring rubric to understand the different levels of \
-performance and the descriptions for each score.
-2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
-generated from completing the task.
-3. Compare output to score descriptions: Compare the output against the criteria and score \
-descriptions in the scoring rubric. For each criterion, decide which description best matches the \
-output.
-4. After comparing the output to the score descriptions, pay attention to the small details that \
-might impact the final score that you assign. Sometimes a small difference can dictate the final \
-score.
-5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
-to specific aspects of the output and comparing them to the rubric.
-6. Assign a final score based on the scoring rubric.
-
-## FORMAT FOR THE EVALUATION
-- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
-- Write the numeric score inside <score> tags, without any additional surrounding text and always \
-after the feedback.
-
-Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
-
-
-USER_PROMPT_NO_INPUTS_TEMPLATE = """# GOAL
-Your job is to evaluate a task carried out by an AI system powered by a large language model.
-
-You will be provided the output of the task, as well as the evaluation criteria \
-and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
-criteria and scoring rubric provided.
-
-# OUTPUT
-Below is the output of the task:
-<output>
-{OUTPUT}
-</output>
-
-# EVALUATION CRITERIA AND SCORING RUBRIC
-Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
-<evaluation_criteria>
-{EVALUATION_CRITERIA}
-</evaluation_criteria>
-
-<scoring_rubric>
-{RUBRIC}
-</scoring_rubric>
-
-# INSTRUCTIONS FOR THE EVALUATION
-1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
-Review the evaluation criteria and scoring rubric to understand the different levels of \
-performance and the descriptions for each score.
-2. Review the output: Examine the output generated from completing the task.
-3. Compare output to score descriptions: Compare the output against the criteria and score \
-descriptions in the scoring rubric. For each criterion, decide which description best matches the \
-output.
-4. After comparing the output to the score descriptions, pay attention to the small details that \
-might impact the final score that you assign. Sometimes a small difference can dictate the final \
-score.
-5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
-to specific aspects of the output and comparing them to the rubric.
-6. Assign a final score based on the scoring rubric.
-
-## FORMAT FOR THE EVALUATION
-- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
-- Write the numeric score inside <score> tags, without any additional surrounding text and always \
-after the feedback.
-
-Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
-
-
-def format_vars(variables: list[dict[str, str]]) -> str:
- """Format variables for the prompt."""
- var_strs = []
- for var in variables:
- for key, value in var.items():
- var_tag = key.lower().replace(" ", "_")
- var_strs.append(f"<{var_tag}>\n{value}\n</{var_tag}>")
- return "\n".join(var_strs)
-
-
-def format_rubric(rubric: list[RubricItem]) -> str:
- """Format the rubric for the prompt."""
- rubric_strs = []
-
- # Sort rubric items by score, lowest to highest
- sorted_rubric = sorted(rubric, key=lambda x: x.score)
-
- for item in sorted_rubric:
- rubric_strs.append(f"- Score {item.score}: {item.description}")
- return "\n".join(rubric_strs)
-
-
-def format_user_prompt(prompt_variables: dict[str, Any]) -> str:
- """Format the user prompt based on provided variables."""
- if prompt_variables["INPUTS"]:
- return USER_PROMPT_TEMPLATE.format(**prompt_variables)
- else:
- return USER_PROMPT_NO_INPUTS_TEMPLATE.format(**prompt_variables)
diff --git a/pyproject.toml b/pyproject.toml
index 1651dfa..83200b3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,11 +62,18 @@ llamafile = [
"openai>=1.51.0",
]
+[project.scripts]
+flow-judge = "flow_judge.utils.cli:cli"
+
[project.urls]
Homepage = "https://github.com/flowaicom/flow-judge"
[tool.setuptools]
packages = ["flow_judge", "flow_judge.integrations", "flow_judge.metrics", "flow_judge.models", "flow_judge.utils"]
+include-package-data = true
+
+[tool.setuptools.package-data]
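+# Include the bundled YAML and Jinja2 template files under flow_judge/metrics/_data/ in the package.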
+"flow_judge.metrics" = ["_data/**/*.yaml", "_data/**/*.j2"]
[tool.setuptools_scm]
version_scheme = "python-simplified-semver"
diff --git a/tests/test_rubric_functionality.py b/tests/test_rubric_functionality.py
new file mode 100644
index 0000000..9f48e0c
--- /dev/null
+++ b/tests/test_rubric_functionality.py
@@ -0,0 +1,144 @@
+import os
+from unittest.mock import create_autospec, patch
+
+import pytest
+from click.testing import CliRunner
+
+from flow_judge.flow_judge import FlowJudge
+from flow_judge.models import Vllm
+from flow_judge.utils.cli import cli
+from flow_judge.utils.notebook import (
+ create_judge_from_yaml,
+ create_metric_from_template,
+ display_rubric_request_form,
+ load_rubric_templates,
+ request_rubric,
+)
+
+# Path to the rubrics directory
+RUBRICS_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "rubrics")
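+# NOTE: these tests assume a top-level "rubrics" directory in the repository that contains
+# the YAML templates referenced below (e.g. article_evaluation/source_attribution.yaml).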
+
+
+@pytest.fixture
+def rubric_templates():
+ """Fixture for rubric templates.
+
+ This fixture loads the rubric templates from the RUBRICS_DIR and returns them.
+ """
+ return load_rubric_templates(RUBRICS_DIR)
+
+
+def test_load_rubric_templates(rubric_templates):
+ """Test the load_rubric_templates function.
+
+ This test ensures that rubric templates are loaded correctly from the RUBRICS_DIR,
+ and checks for the existence of specific rubrics.
+ """
+ assert len(rubric_templates) > 0
+ # Check for the existence of specific rubrics
+ assert "article_evaluation/source_attribution" in rubric_templates
+ assert "query_decomposition/sub_query_coverage" in rubric_templates
+
+
+def test_create_metric_from_template(rubric_templates):
+ """Test the create_metric_from_template function.
+
+ This test verifies that a metric is correctly created from a template,
+ checking the name, criteria, and rubric length.
+ """
+ template = rubric_templates["article_evaluation/source_attribution"]
+ metric = create_metric_from_template(template)
+ assert metric.name == "article_source_attribution"
+ assert metric.criteria == template.criteria
+ assert len(metric.rubric) == len(template.rubric)
+
+
+def test_create_judge_from_yaml():
+ """Test the create_judge_from_yaml function.
+
+ This test ensures that a FlowJudge instance is correctly created from a YAML file,
+ using a mock Vllm model.
+ """
+ yaml_path = os.path.join(RUBRICS_DIR, "article_evaluation", "source_attribution.yaml")
+
+ # Create a mock Vllm instance that will pass isinstance checks
+ mock_vllm = create_autospec(Vllm, instance=True)
+
+ with patch("flow_judge.utils.rubrics.Vllm", return_value=mock_vllm):
+ judge = create_judge_from_yaml(yaml_path, mock_vllm)
+
+ # Assert that the FlowJudge is created correctly
+ assert isinstance(judge, FlowJudge)
+ assert judge.metric.name == "article_source_attribution"
+
+ # Assert that the created judge uses the mock Vllm instance
+ assert judge.model == mock_vllm
+
+
+@pytest.mark.skipif(not os.environ.get("JUPYTER_AVAILABLE"), reason="Requires Jupyter environment")
+def test_display_rubric_request_form():
+ """Test the display_rubric_request_form function.
+
+ This test verifies that the function calls IPython.display.display
+ when executed in a Jupyter environment.
+ """
+ with patch("IPython.display.display") as mock_display:
+ display_rubric_request_form()
+ mock_display.assert_called_once()
+
+
+@pytest.mark.parametrize(
+ "title,description,similar_to",
+ [
+ ("Test Rubric", "This is a test rubric request", "article_evaluation/source_attribution"),
+ ("Another Test", "Another description", None),
+ ],
+)
+def test_request_rubric(title, description, similar_to):
+ """Test the request_rubric function with various input parameters.
+
+ This test ensures that the function opens a web browser with the correct URL
+ for different combinations of title, description, and similar_to parameters.
+ """
+ with patch("webbrowser.open") as mock_open:
+ request_rubric(title, description, similar_to)
+ mock_open.assert_called_once()
+
+
+def test_cli_command():
+ """Test the CLI command for requesting a rubric.
+
+ This test verifies that the CLI command correctly calls the request_rubric function
+ with the provided arguments.
+ """
+ runner = CliRunner()
+ with patch("flow_judge.utils.cli.request_rubric") as mock_request:
+ result = runner.invoke(
+ cli,
+ [
+ "request-rubric",
+ "--title",
+ "CLI Test Rubric",
+ "--description",
+ "This is a test rubric request from CLI",
+ "--similar-to",
+ "article_evaluation/source_attribution",
+ ],
+ )
+ assert result.exit_code == 0
+ mock_request.assert_called_once_with(
+ "CLI Test Rubric",
+ "This is a test rubric request from CLI",
+ "article_evaluation/source_attribution",
+ )
+
+
+# Test for the is_notebook() helper used to detect Jupyter environments.
+def test_is_notebook():
+ """Test the is_notebook function.
+
+ This test checks that the is_notebook function returns False when run in a pytest environment.
+ """
+ from flow_judge.utils.notebook import is_notebook
+
+ assert not is_notebook() # This should return False when run in pytest