Skip to content

Commit 5d87600

Browse files
authored
Bugfix
See comment: centerforaisafety/hle#8
1 parent a1e5737 commit 5d87600

File tree

1 file changed

+1
-2
lines changed

1 file changed

+1
-2
lines changed

hle_eval/run_judge_results.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,6 @@ async def extract_answer(question, correct_answer, response):
4545
try:
4646
response = await client.beta.chat.completions.parse(
4747
model=args.judge,
48-
temperature=0.0,
4948
max_completion_tokens=4096, # overkill for judge
5049
messages=[
5150
{"role": "user", "content": prompt}
@@ -159,7 +158,7 @@ def dump_metrics(predictions, n):
159158
def main(args):
160159
assert args.num_workers > 1, "num_workers must be 2 or greater"
161160

162-
output_filepath = f"judged_{args.predictions}.json"
161+
output_filepath = f"judged_{os.path.basename(args.predictions)}.json"
163162
dataset = load_dataset(args.dataset, split="test").to_dict()
164163
# convert to list of json for async parallelism
165164
questions = [dict(zip(dataset.keys(), values)) for values in zip(*dataset.values())]

0 commit comments

Comments (0)