Fixing LitQAEvaluation bugs: incorrect reward indices, not using LLM's native knowledge #708
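For context on the first bug in the title (not part of the PR itself): the old positional reward tuple appears to have been intended as (correct, unsure, incorrect), while the old `IntEnum` values were CORRECT=0, INCORRECT=1, UNSURE=2, so indexing rewards by enum value swapped the unsure and incorrect rewards. A minimal sketch reconstructed from the old and new code below:

```python
# Old scheme (before this PR): a positional tuple whose intended order appears to be
# (correct, unsure, incorrect), indexed by IntEnum values CORRECT=0, INCORRECT=1, UNSURE=2.
OLD_REWARD_DISTRIBUTION = (1.0, 0.1, -1.0)
OLD_ENUM_VALUES = {"CORRECT": 0, "INCORRECT": 1, "UNSURE": 2}

# rewards[enum.value] handed INCORRECT the unsure reward and UNSURE the penalty.
assert OLD_REWARD_DISTRIBUTION[OLD_ENUM_VALUES["INCORRECT"]] == 0.1
assert OLD_REWARD_DISTRIBUTION[OLD_ENUM_VALUES["UNSURE"]] == -1.0

# New scheme (this PR): a StrEnum plus a name-keyed mapping, so the lookup cannot drift.
DEFAULT_REWARD_MAPPING = {"correct": 1.0, "unsure": 0.1, "incorrect": -1.0}
assert DEFAULT_REWARD_MAPPING["incorrect"] == -1.0
```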

Merged: 11 commits, Nov 19, 2024
Changes from all commits
16 changes: 10 additions & 6 deletions paperqa/agents/task.py
@@ -10,7 +10,7 @@
 import logging
 import re
 from abc import ABC
-from collections.abc import Awaitable, Callable, Sequence
+from collections.abc import Awaitable, Callable, Mapping, Sequence
 from copy import deepcopy
 from enum import StrEnum
 from typing import TYPE_CHECKING, Any, Self, assert_never
@@ -41,7 +41,7 @@ class ComputeTrajectoryMetricsMixin: # type: ignore[no-redef]
 from paperqa.litqa import (
     DEFAULT_EVAL_MODEL_NAME,
     DEFAULT_LABBENCH_HF_HUB_NAME,
-    DEFAULT_REWARD_DISTRIBUTION,
+    DEFAULT_REWARD_MAPPING,
     LitQAEvaluation,
     read_litqa_v2_from_hub,
 )
@@ -72,7 +72,7 @@ def __init__(
             Callable[[PQASession | str], Awaitable[LitQAEvaluation]] | None
         ) = None,
         sources: str | list[str] | None = None,
-        rewards: Sequence[float] = DEFAULT_REWARD_DISTRIBUTION,
+        rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING,
         evaluation_callback: Callable[[LitQAEvaluation], Awaitable] | None = None,
         **env_kwargs,
     ):
@@ -187,7 +187,7 @@ def __init__(
         self,
         base_query: QueryRequest | dict | None = None,
         base_docs: Docs | dict | None = None,
-        rewards: Sequence[float] = DEFAULT_REWARD_DISTRIBUTION,
+        rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING,
         eval_model: LLMModel | str = DEFAULT_EVAL_MODEL_NAME,
         **env_kwargs,
     ):
@@ -272,10 +272,14 @@ def compute_trajectory_metrics(
             "relevant_paper_count": relevant_paper_count,
             "evidence_count": evidence_count,
             "correct": [
-                int(t.steps[-1].reward == self._rewards[0]) for t in trajectories
+                int(t.steps[-1].reward == self._rewards["correct"])
+                for t in trajectories
             ],
             "correct_unsure": [
-                int(t.steps[-1].reward in {self._rewards[0], self._rewards[1]})
+                int(
+                    t.steps[-1].reward
+                    in {self._rewards["correct"], self._rewards["unsure"]}
+                )
                 for t in trajectories
             ],
         }
71 changes: 53 additions & 18 deletions paperqa/litqa.py
@@ -4,9 +4,10 @@
 
 import random
 import re
+import string
 from ast import literal_eval
-from collections.abc import Awaitable, Callable, Sequence
-from enum import IntEnum
+from collections.abc import Awaitable, Callable, Mapping, Sequence
+from enum import StrEnum
 from typing import TYPE_CHECKING
 
 try:
@@ -29,22 +30,36 @@
 
 def make_mc_options(
     ideal: str,
-    distractors: str | list[str],
+    distractors: str | Sequence[str],
     unsure_option: str | None = UNSURE_OPTION,
     seed: int | None = None,
-) -> tuple[str, str, str | None]:
-    """Return string of options (as letters) and correct answer."""
+) -> tuple[str, str, str | None, list[str]]:
+    r"""
+    Return string of options (as letters) and correct answer.
+
+    Examples:
+        >>> text, ideal_answer, unsure_answer, distractor_answers = make_mc_options(
+        ...     ideal="1", distractors=["0", "2", "Dog"], seed=0
+        ... )
+        >>> text
+        'A) Dog\nB) 2\nC) 0\nD) Insufficient information to answer this question\nE) 1'
+        >>> ideal_answer
+        'E'
+        >>> unsure_answer
+        'D'
+        >>> distractor_answers
+        ['C', 'B', 'A']
+    """
     if isinstance(distractors, str):
         try:
             split_distractors = literal_eval(distractors)
             if not isinstance(split_distractors, list):
                 raise TypeError("Need split_distractors to be a list.") # noqa: TRY301
         except (ValueError, SyntaxError, TypeError):
             split_distractors = [d.strip("'[ ]\"") for d in distractors.split(",")]
-        options: list[str] = split_distractors
-    else:
-        # We are going to modify options in-place, so copy it first
-        options = distractors.copy()
+        distractors = split_distractors
+    # We are going to modify options in-place, so copy the distractors
+    options = [*distractors]
 
     if ideal == "null":
         if not unsure_option:
@@ -61,34 +76,41 @@ def make_mc_options(
     if unsure_option:
         options.append(unsure_option)
 
+    if len(options) > len(string.ascii_lowercase):
+        raise NotImplementedError(
+            "Didn't handle more multiple choice options than letters, options were"
+            f" {options}."
+        )
     random.Random(seed).shuffle(options)
     return (
         "\n".join([f"{_CAPITAL_A_INDEX + i:c}) {o}" for i, o in enumerate(options)]),
         chr(_CAPITAL_A_INDEX + options.index(correct_answer)),
         chr(_CAPITAL_A_INDEX + options.index(unsure_option)) if unsure_option else None,
+        [chr(_CAPITAL_A_INDEX + options.index(dstr)) for dstr in distractors],
     )
 
 
 DEFAULT_EVAL_MODEL_NAME = "gpt-4-turbo-2024-04-09"
-DEFAULT_REWARD_DISTRIBUTION: tuple[float, float, float] = 1.0, 0.1, -1.0
+DEFAULT_REWARD_MAPPING = {"correct": 1.0, "unsure": 0.1, "incorrect": -1.0}
 
 
-class LitQAEvaluation(IntEnum):
+class LitQAEvaluation(StrEnum):
     """Possible evaluation results for a LitQA question."""
 
-    CORRECT = 0
-    INCORRECT = 1
-    UNSURE = 2
+    CORRECT = "correct"
+    INCORRECT = "incorrect"
+    UNSURE = "unsure"
 
     def make_discounted_returns(
         self,
         num_steps: int,
-        rewards: Sequence[float] = DEFAULT_REWARD_DISTRIBUTION,
+        rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING,
         discount: float = 1.0,
     ) -> list[float]:
         try:
             return discounted_returns(
-                [i * rewards[self.value] for i in range(num_steps, 0, -1)],
+                # paper-qa has no intermediary rewards
+                [0] * (num_steps - 1) + [rewards[self.value]],
                 terminated=[False] * (num_steps - 1) + [True],
                 discount=discount,
             )
@@ -100,7 +122,11 @@ def make_discounted_returns(
 
     @classmethod
     def from_answer(
-        cls, text: str, ideal_mc_answer: str, unsure_mc_answer: str | None = None
+        cls,
+        text: str,
+        ideal_mc_answer: str,
+        unsure_mc_answer: str | None = None,
+        total_options: int | None = None,
     ) -> LitQAEvaluation:
         """Compare text with a multiple choice answer or optionally an unsure answer."""
 
@@ -112,6 +138,14 @@ def extract_answer(answer: str) -> str:
             return answer.split()[0][0].upper()
 
         result = extract_answer(text)
+        if (
+            total_options is not None
+            and ord(result[0]) - _CAPITAL_A_INDEX + 1 > total_options
+        ):
+            # The result extracted was not in the options
+            return cls.INCORRECT
+        # From here, if we don't match either the ideal or the unsure multiple choice
+        # options then we declare the answer as incorrect.
         evaluation_result = cls.INCORRECT
         if unsure_mc_answer and result[0].lower() == unsure_mc_answer[0].lower():
             evaluation_result = cls.UNSURE
@@ -145,7 +179,7 @@ def from_question(
             Two-tuple of created LitQA question, function (that can be thought of as
                 stateless) to use to extract an evaluation result from an answer.
         """
-        text, ideal_answer, unsure_answer = make_mc_options(
+        text, ideal_answer, unsure_answer, distractor_answers = make_mc_options(
             ideal=ideal,
             distractors=distractors,
             seed=seed,
@@ -180,6 +214,7 @@ async def llm_from_answer(answer: PQASession | str) -> LitQAEvaluation:
                 text=eval_chunk.text,
                 ideal_mc_answer=ideal_answer,
                 unsure_mc_answer=unsure_answer,
+                total_options=len(distractor_answers) + (2 if use_unsure else 1),
             )
 
         return qa_prompt, llm_from_answer
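As a usage sketch of the updated paperqa/litqa.py API shown above (illustrative only; assumes the import path in this diff and that the optional dependency providing `discounted_returns` is installed):

```python
from paperqa.litqa import DEFAULT_REWARD_MAPPING, LitQAEvaluation

# Grade a raw model answer against the ideal/unsure letters that make_mc_options produced.
evaluation = LitQAEvaluation.from_answer(
    text="E) 1", ideal_mc_answer="E", unsure_mc_answer="D", total_options=5
)
assert evaluation == LitQAEvaluation.CORRECT

# With the StrEnum and name-keyed mapping, the reward is looked up by name
# and is only granted on the terminal step of the trajectory.
returns = evaluation.make_discounted_returns(num_steps=3, rewards=DEFAULT_REWARD_MAPPING)
assert returns[-1] == DEFAULT_REWARD_MAPPING["correct"]
```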
4 changes: 2 additions & 2 deletions paperqa/prompts.py
@@ -99,8 +99,8 @@
 QA_PROMPT_TEMPLATE = "Q: {question}\n\nOptions:\n{options}"
 EVAL_PROMPT_TEMPLATE = (
     "Extract the single letter answer from the following question and answer"
-    "\n\n{qa_prompt}"
-    "\n\n{qa_answer}"
+    "\n\nQuestion: {qa_prompt}"
+    "\n\nAnswer: {qa_answer}"
     "\n\nSingle Letter Answer:"
Comment on lines 101 to 104

Contributor:

I'd suggest something like:

"Given the following question and a proposed answer to the question, return the single-letter choice that matches the proposed answer."
"\n\nQuestion: ..."
"\n\nProposed Answer: ..."

If I didn't know the context of this method, as a human, I'd find the original prompt unclear.

Collaborator (author):

Yeah, it's a nice suggestion, but making this change breaks two of our test cases. Let's save this for another PR.

Collaborator (author):

Incorporated into #724
 )
 
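To make the prompt change concrete, here is a short render of the updated template (a sketch; the question and answer values are made up):

```python
# Illustrative render of the updated EVAL_PROMPT_TEMPLATE; the filled-in values are hypothetical.
EVAL_PROMPT_TEMPLATE = (
    "Extract the single letter answer from the following question and answer"
    "\n\nQuestion: {qa_prompt}"
    "\n\nAnswer: {qa_answer}"
    "\n\nSingle Letter Answer:"
)

print(
    EVAL_PROMPT_TEMPLATE.format(
        qa_prompt="Q: How many moons does Earth have?\n\nOptions:\nA) 0\nB) 1",
        qa_answer="The answer is B) 1.",
    )
)
# The explicit "Question:" / "Answer:" labels tell the eval model which block is the
# proposed answer, instead of two unlabeled text dumps as in the previous template.
```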
1 change: 1 addition & 0 deletions pyproject.toml
@@ -199,6 +199,7 @@ disable = [
   "broad-exception-caught", # Don't care to enforce this
   "broad-exception-raised", # Rely on ruff TRY002 for this
   "cyclic-import", # Let Python blow up
+  "dangerous-default-value", # Rely on ruff W0102 for this
   "expression-not-assigned", # Rely on mypy func-returns-value for this
   "fixme", # codetags are useful
   "function-redefined", # Rely on mypy no-redef for this