Fixing LitQAEvaluation bugs: incorrect reward indices, not using LLM's native knowledge #708
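For context on the first bug in the title (not part of the PR itself): the old positional reward tuple appears to have been intended as (correct, unsure, incorrect), while the old `IntEnum` values were CORRECT=0, INCORRECT=1, UNSURE=2, so indexing rewards by enum value swapped the unsure and incorrect rewards. A minimal sketch reconstructed from the old and new code below:

```python
# Old scheme (before this PR): a positional tuple whose intended order appears to be
# (correct, unsure, incorrect), indexed by IntEnum values CORRECT=0, INCORRECT=1, UNSURE=2.
OLD_REWARD_DISTRIBUTION = (1.0, 0.1, -1.0)
OLD_ENUM_VALUES = {"CORRECT": 0, "INCORRECT": 1, "UNSURE": 2}

# rewards[enum.value] handed INCORRECT the unsure reward and UNSURE the penalty.
assert OLD_REWARD_DISTRIBUTION[OLD_ENUM_VALUES["INCORRECT"]] == 0.1
assert OLD_REWARD_DISTRIBUTION[OLD_ENUM_VALUES["UNSURE"]] == -1.0

# New scheme (this PR): a StrEnum plus a name-keyed mapping, so the lookup cannot drift.
DEFAULT_REWARD_MAPPING = {"correct": 1.0, "unsure": 0.1, "incorrect": -1.0}
assert DEFAULT_REWARD_MAPPING["incorrect"] == -1.0
```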

Merged: 11 commits, Nov 19, 2024
Changes from all commits
16 changes: 10 additions & 6 deletions paperqa/agents/task.py
@@ -10,7 +10,7 @@
 import logging
 import re
 from abc import ABC
-from collections.abc import Awaitable, Callable, Sequence
+from collections.abc import Awaitable, Callable, Mapping, Sequence
 from copy import deepcopy
 from enum import StrEnum
 from typing import TYPE_CHECKING, Any, Self, assert_never
@@ -41,7 +41,7 @@ class ComputeTrajectoryMetricsMixin: # type: ignore[no-redef]
 from paperqa.litqa import (
     DEFAULT_EVAL_MODEL_NAME,
     DEFAULT_LABBENCH_HF_HUB_NAME,
-    DEFAULT_REWARD_DISTRIBUTION,
+    DEFAULT_REWARD_MAPPING,
     LitQAEvaluation,
     read_litqa_v2_from_hub,
 )
@@ -72,7 +72,7 @@ def __init__(
             Callable[[PQASession | str], Awaitable[LitQAEvaluation]] | None
         ) = None,
         sources: str | list[str] | None = None,
-        rewards: Sequence[float] = DEFAULT_REWARD_DISTRIBUTION,
+        rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING,
         evaluation_callback: Callable[[LitQAEvaluation], Awaitable] | None = None,
         **env_kwargs,
     ):
@@ -187,7 +187,7 @@ def __init__(
         self,
         base_query: QueryRequest | dict | None = None,
         base_docs: Docs | dict | None = None,
-        rewards: Sequence[float] = DEFAULT_REWARD_DISTRIBUTION,
+        rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING,
         eval_model: LLMModel | str = DEFAULT_EVAL_MODEL_NAME,
         **env_kwargs,
     ):
@@ -272,10 +272,14 @@ def compute_trajectory_metrics(
             "relevant_paper_count": relevant_paper_count,
             "evidence_count": evidence_count,
             "correct": [
-                int(t.steps[-1].reward == self._rewards[0]) for t in trajectories
+                int(t.steps[-1].reward == self._rewards["correct"])
+                for t in trajectories
             ],
             "correct_unsure": [
-                int(t.steps[-1].reward in {self._rewards[0], self._rewards[1]})
+                int(
+                    t.steps[-1].reward
+                    in {self._rewards["correct"], self._rewards["unsure"]}
+                )
                 for t in trajectories
             ],
         }
71 changes: 53 additions & 18 deletions paperqa/litqa.py
@@ -4,9 +4,10 @@
 
 import random
 import re
+import string
 from ast import literal_eval
-from collections.abc import Awaitable, Callable, Sequence
-from enum import IntEnum
+from collections.abc import Awaitable, Callable, Mapping, Sequence
+from enum import StrEnum
 from typing import TYPE_CHECKING
 
 try:
@@ -29,22 +30,36 @@
 
 def make_mc_options(
     ideal: str,
-    distractors: str | list[str],
+    distractors: str | Sequence[str],
     unsure_option: str | None = UNSURE_OPTION,
     seed: int | None = None,
-) -> tuple[str, str, str | None]:
-    """Return string of options (as letters) and correct answer."""
+) -> tuple[str, str, str | None, list[str]]:
+    r"""
+    Return string of options (as letters) and correct answer.
+
+    Examples:
+        >>> text, ideal_answer, unsure_answer, distractor_answers = make_mc_options(
+        ...     ideal="1", distractors=["0", "2", "Dog"], seed=0
+        ... )
+        >>> text
+        'A) Dog\nB) 2\nC) 0\nD) Insufficient information to answer this question\nE) 1'
+        >>> ideal_answer
+        'E'
+        >>> unsure_answer
+        'D'
+        >>> distractor_answers
+        ['C', 'B', 'A']
+    """
     if isinstance(distractors, str):
         try:
             split_distractors = literal_eval(distractors)
             if not isinstance(split_distractors, list):
                 raise TypeError("Need split_distractors to be a list.") # noqa: TRY301
         except (ValueError, SyntaxError, TypeError):
             split_distractors = [d.strip("'[ ]\"") for d in distractors.split(",")]
-        options: list[str] = split_distractors
-    else:
-        # We are going to modify options in-place, so copy it first
-        options = distractors.copy()
+        distractors = split_distractors
+    # We are going to modify options in-place, so copy the distractors
+    options = [*distractors]
 
     if ideal == "null":
         if not unsure_option:
@@ -61,34 +76,41 @@ def make_mc_options(
     if unsure_option:
         options.append(unsure_option)
 
+    if len(options) > len(string.ascii_lowercase):
+        raise NotImplementedError(
+            "Didn't handle more multiple choice options than letters, options were"
+            f" {options}."
+        )
     random.Random(seed).shuffle(options)
     return (
         "\n".join([f"{_CAPITAL_A_INDEX + i:c}) {o}" for i, o in enumerate(options)]),
         chr(_CAPITAL_A_INDEX + options.index(correct_answer)),
         chr(_CAPITAL_A_INDEX + options.index(unsure_option)) if unsure_option else None,
+        [chr(_CAPITAL_A_INDEX + options.index(dstr)) for dstr in distractors],
     )
 
 
 DEFAULT_EVAL_MODEL_NAME = "gpt-4-turbo-2024-04-09"
-DEFAULT_REWARD_DISTRIBUTION: tuple[float, float, float] = 1.0, 0.1, -1.0
+DEFAULT_REWARD_MAPPING = {"correct": 1.0, "unsure": 0.1, "incorrect": -1.0}
 
 
-class LitQAEvaluation(IntEnum):
+class LitQAEvaluation(StrEnum):
     """Possible evaluation results for a LitQA question."""
 
-    CORRECT = 0
-    INCORRECT = 1
-    UNSURE = 2
+    CORRECT = "correct"
+    INCORRECT = "incorrect"
+    UNSURE = "unsure"
 
     def make_discounted_returns(
         self,
         num_steps: int,
-        rewards: Sequence[float] = DEFAULT_REWARD_DISTRIBUTION,
+        rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING,
         discount: float = 1.0,
     ) -> list[float]:
         try:
             return discounted_returns(
-                [i * rewards[self.value] for i in range(num_steps, 0, -1)],
+                # paper-qa has no intermediary rewards
+                [0] * (num_steps - 1) + [rewards[self.value]],
                 terminated=[False] * (num_steps - 1) + [True],
                 discount=discount,
             )
@@ -100,7 +122,11 @@ def make_discounted_returns(
 
     @classmethod
     def from_answer(
-        cls, text: str, ideal_mc_answer: str, unsure_mc_answer: str | None = None
+        cls,
+        text: str,
+        ideal_mc_answer: str,
+        unsure_mc_answer: str | None = None,
+        total_options: int | None = None,
     ) -> LitQAEvaluation:
         """Compare text with a multiple choice answer or optionally an unsure answer."""
 
@@ -112,6 +138,14 @@ def extract_answer(answer: str) -> str:
             return answer.split()[0][0].upper()
 
         result = extract_answer(text)
+        if (
+            total_options is not None
+            and ord(result[0]) - _CAPITAL_A_INDEX + 1 > total_options
+        ):
+            # The result extracted was not in the options
+            return cls.INCORRECT
+        # From here, if we don't match either the ideal or the unsure multiple choice
+        # options then we declare the answer as incorrect.
         evaluation_result = cls.INCORRECT
         if unsure_mc_answer and result[0].lower() == unsure_mc_answer[0].lower():
             evaluation_result = cls.UNSURE
@@ -145,7 +179,7 @@ def from_question(
             Two-tuple of created LitQA question, function (that can be thought of as
                 stateless) to use to extract an evaluation result from an answer.
         """
-        text, ideal_answer, unsure_answer = make_mc_options(
+        text, ideal_answer, unsure_answer, distractor_answers = make_mc_options(
             ideal=ideal,
             distractors=distractors,
             seed=seed,
@@ -180,6 +214,7 @@ async def llm_from_answer(answer: PQASession | str) -> LitQAEvaluation:
                 text=eval_chunk.text,
                 ideal_mc_answer=ideal_answer,
                 unsure_mc_answer=unsure_answer,
+                total_options=len(distractor_answers) + (2 if use_unsure else 1),
             )
 
         return qa_prompt, llm_from_answer
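As a usage sketch of the updated paperqa/litqa.py API shown above (illustrative only; assumes the import path in this diff and that the optional dependency providing `discounted_returns` is installed):

```python
from paperqa.litqa import DEFAULT_REWARD_MAPPING, LitQAEvaluation

# Grade a raw model answer against the ideal/unsure letters that make_mc_options produced.
evaluation = LitQAEvaluation.from_answer(
    text="E) 1", ideal_mc_answer="E", unsure_mc_answer="D", total_options=5
)
assert evaluation == LitQAEvaluation.CORRECT

# With the StrEnum and name-keyed mapping, the reward is looked up by name
# and is only granted on the terminal step of the trajectory.
returns = evaluation.make_discounted_returns(num_steps=3, rewards=DEFAULT_REWARD_MAPPING)
assert returns[-1] == DEFAULT_REWARD_MAPPING["correct"]
```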
4 changes: 2 additions & 2 deletions paperqa/prompts.py
@@ -99,8 +99,8 @@
 QA_PROMPT_TEMPLATE = "Q: {question}\n\nOptions:\n{options}"
 EVAL_PROMPT_TEMPLATE = (
     "Extract the single letter answer from the following question and answer"
-    "\n\n{qa_prompt}"
-    "\n\n{qa_answer}"
+    "\n\nQuestion: {qa_prompt}"
+    "\n\nAnswer: {qa_answer}"
     "\n\nSingle Letter Answer:"
Comment on lines 101 to 104

Contributor:

I'd suggest something like:

"Given the following question and a proposed answer to the question, return the single-letter choice that matches the proposed answer."
"\n\nQuestion: ..."
"\n\nProposed Answer: ..."

If I didn't know the context of this method, as a human, I'd find the original prompt unclear.

Collaborator (author):

Yeah, it's a nice suggestion, but making this change breaks two of our test cases. Let's save this for another PR.

Collaborator (author):

Incorporated into #724
 )
 
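To make the prompt change concrete, here is a short render of the updated template (a sketch; the question and answer values are made up):

```python
# Illustrative render of the updated EVAL_PROMPT_TEMPLATE; the filled-in values are hypothetical.
EVAL_PROMPT_TEMPLATE = (
    "Extract the single letter answer from the following question and answer"
    "\n\nQuestion: {qa_prompt}"
    "\n\nAnswer: {qa_answer}"
    "\n\nSingle Letter Answer:"
)

print(
    EVAL_PROMPT_TEMPLATE.format(
        qa_prompt="Q: How many moons does Earth have?\n\nOptions:\nA) 0\nB) 1",
        qa_answer="The answer is B) 1.",
    )
)
# The explicit "Question:" / "Answer:" labels tell the eval model which block is the
# proposed answer, instead of two unlabeled text dumps as in the previous template.
```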
1 change: 1 addition & 0 deletions pyproject.toml
@@ -199,6 +199,7 @@ disable = [
   "broad-exception-caught", # Don't care to enforce this
   "broad-exception-raised", # Rely on ruff TRY002 for this
   "cyclic-import", # Let Python blow up
+  "dangerous-default-value", # Rely on ruff W0102 for this
   "expression-not-assigned", # Rely on mypy func-returns-value for this
   "fixme", # codetags are useful
   "function-redefined", # Rely on mypy no-redef for this