forked from taisazero/socratic-debugging-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_socratic_benchmark_metrics.py
425 lines (380 loc) · 21.5 KB
/
run_socratic_benchmark_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
from metrics.metric_computer import MetricComputer
from data_utils.data_reader import XMLDataReader
from inference.gpt_inference import ChatGPTModel
from data_utils.data_reader import extract_tag_content
import argparse
from tqdm import tqdm
import os
import regex as re
def debug():
    """Parse a small hand-written dialogue and print the parsed result.

    Builds an ``XMLDataReader`` rooted at ``reviewing_dialogues/final_dataset``
    (relative to this file) and feeds a hard-coded sample dialogue to its
    ``_parse_dialogue`` method, printing whatever that method returns.
    """
    # Alternative smoke check for the metric computer, kept for reference:
    # metric = MetricComputer(export_to_excel=True, export_path="results.xlsx")
    # predictions = ["hello there, I am badeed", "general kenobi kappacino frank"]
    # references = [
    #     ["hello there, I am badeed", "hi here! you like cats?"],
    #     ["general kenobi kappacino frank", "general grievous badeed neice"]
    # ]
    # scores = metric.compute_overall_score(predictions, references)
    # print(scores)

    # Sample dialogue exercising user/assistant turns, <alt> alternatives and
    # <code> blocks.
    dialogue_text = """
User: I am trying to run the code but it is not working.
<alt> Nothing is working.
Assistant: What is the error message?
<alt> Any error message?
<alt> What does the terminal say?
User: I figured it out. I was missing a comma.
<code>
import pandas as pd
if True:
df = pd.read_csv('data.csv' , sep=',')
</code>
<alt> Ah! I think it is because the file name is wrong.
<code>
import pandas as pd
if True:
df = pd.read_csv('my_data.csv')
</code>
Assistant: Great! It is working now.
<alt> Awesome! Good job!
User: Thank you for your help.
<alt> Thanks!
"""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    dataset_dir = os.path.join(script_dir, "reviewing_dialogues", "final_dataset")
    reader = XMLDataReader(dataset_dir)
    parsed = reader._parse_dialogue(dialogue_text)
    print(parsed)
def _build_generation_args(generation_mode, n):
    """Return the ``generation_args`` dict passed to ``ChatGPTModel``.

    Only ``temperature`` and ``top_p`` depend on the mode: ``"sample"`` uses
    0.75 / 0.9, every other mode uses 0.0 / 0.0.
    """
    sampling = generation_mode == "sample"
    return {
        "max_tokens": 1024,
        "temperature": 0.75 if sampling else 0.0,
        "top_p": 0.9 if sampling else 0.0,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
        "n": n,  # number of responses to return
        "stream": False,
    }


def _select_steering_prompt(generation_mode):
    """Return the steering prompt string used for ``generation_mode``."""
    if generation_mode == "multiple":
        return "You are a tutor that always responds in the Socratic style. You *never* give the student the answer, but always try to ask just the right question to help them learn to think for themselves. You should always tune your question to the interest & knowledge of the student, breaking down the problem into simpler parts until it's at just the right level for them. Socratic utterances are utterances that guide the user and do not give them the solution directly. In each of your responses, provide a comprehensive list of Socratic responses that you can give to the user to help them solve the problem on their own, based on the conversation so far."
    if generation_mode == "cot":
        # NOTE(review): the backslash before "&" is part of the prompt text
        # (it is preserved here, escaped explicitly to avoid an invalid-escape
        # warning). It looks like a LaTeX leftover -- confirm before removing,
        # since dropping it changes the prompt sent to the models.
        return "You are a reflective and experienced tutor. You always introspect and think about all the reasons causing the user to make their mistake. When asked to respond to the user you always respond in the Socratic style. You *never* give the student the answer, but always try to ask just the right question to help them learn to think for themselves. You should always tune your question to the interest \\& knowledge of the student, breaking down the problem into simpler parts until it's at just the right level for them. Socratic utterances guide the user and do not give them the solution directly. You are as comprehensive as possible when listing reasons. You are also as comprehensive as possible when listing Socratic utterances guiding the user. Your responses should be in-line with the instruction you are given."
    return "You are a tutor that always responds in the Socratic style. You *never* give the student the answer, but always try to ask just the right question to help them learn to think for themselves. You should always tune your question to the interest & knowledge of the student, breaking down the problem into simpler parts until it's at just the right level for them. Socratic utterances are utterances that guide the user and do not give them the solution directly."


def _select_instruction(generation_mode):
    """Return the instruction for ``generation_mode``.

    For ``"cot"`` this is a dict with the keys ``first_introspection``,
    ``introspection`` and ``respond``; for every other mode it is a string.
    """
    if generation_mode == "multiple":
        return "Respond to the user with all possible distinct Socratic utterances that guide the user to discover and fix the bug described between `<bug_desc>` and `</bug_desc>`. Student code is written between `<code>` and `</code>` throughout the conversation. Utterances that have the same meaning but different words are considered duplicates. Assume that the student has run the test cases.\n1."
    if generation_mode == "cot":
        return {
            "first_introspection": "After looking at the buggy code, the bug description, and the bug fix, what are all the possible reasons or misconceptions that led the user to make this mistake? Do NOT list Socratic questions.",
            "introspection": 'Given the dialogue so far, what are all the possible reasons or misconceptions if any that the user still has that impede them from fixing the bug? Do NOT list Socratic questions. If the bug is already fixed, say "There are no remaining misconceptions or reasons that impede the user from fixing the bug, as they have already identified and corrected their code."',
            "respond": "Utilizing the dialogue and reasons or misconceptions so far, respond to the user with all possible distinct Socratic utterances that guide the user to discover and fix the bug described between `<bug_desc>` and `</bug_desc>`. Student code is written between `<code>` and `</code>` throughout the conversation. Utterances that have the same meaning but different words are considered duplicates. Assume that the student has run the test cases.\n1.",
        }
    return "Respond to the user with a Socratic utterance that guides the user to discover and fix the bug described between `<bug_desc>` and `</bug_desc>`. Student code is written between `<code>` and `</code>` throughout the conversation. Assume that the student has run the test cases."


def _user_message(turn):
    """Return the ``("User", text)`` chat tuple for ``turn``.

    The turn's code, when present, is appended inside ``<code>`` tags.
    """
    if turn["code_text"] != "":
        return (
            "User",
            turn["user_text"] + "\n<code>\n" + turn["code_text"] + "\n</code>\n",
        )
    return ("User", turn["user_text"])


def _previous_turn_transcript(prev_turn):
    """Return the plain-text transcript of an already completed turn.

    The code block (if any) is emitted after the assistant line, which is the
    order the prompts have always used.
    """
    text = f"User: {prev_turn['user_text']}\n"
    text += f"Assistant: {prev_turn['assistant_text']}\n"
    if prev_turn["code_text"] != "":
        text += f"<code>\n{prev_turn['code_text']}\n</code>\n"
    return text


def _current_turn_transcript(turn):
    """Return the plain-text transcript of the turn awaiting a response."""
    text = f"User: {turn['user_text']}\n"
    if turn["code_text"] != "":
        text += f"<code>\n{turn['code_text']}\n</code>\n"
    return text


def _parse_itemized_response(response):
    """Split a numbered-list response string into a list of utterances.

    The "multiple"/"cot" instructions end with ``1.``, so the response is
    presumably a continuation of that list; a leading ``1.`` is restored when
    missing. A ``<code>`` block, if present, is removed before the items are
    extracted with a regex that drops the item numbers.
    """
    if not response.startswith("1."):
        response = "1." + response if response.startswith(" ") else "1. " + response
    # clean the response from any code block
    if "<code>" in response:
        remove_code = extract_tag_content(response, "code")
        response = response.replace(f"<code>\n{remove_code}\n</code>", "")
    return re.findall(r"\d+\.\s*(.*)", response)


def main(use_chat_prompt, generation_mode, num_responses, dataset_path, eval_method):
    """Generate responses for every dialogue turn with two models and score them.

    For each turn of each parsed dialogue a prompt (or chat message list) is
    built, a "chatgpt" and a "gpt-4" ``ChatGPTModel`` generate a response, and
    the responses are scored against the turn's reference utterances with
    ``MetricComputer``. The scores of both models are printed.

    Args:
        use_chat_prompt: If truthy, send the dialogue as a chat message list
            via ``generate_turn``; otherwise send a single instruction prompt
            via ``generate``.
        generation_mode: ``"multiple"``, ``"cot"`` and ``"sample"`` are handled
            specially; any other value is treated as single-response mode.
        num_responses: Number of responses requested per call; only used when
            ``generation_mode`` is ``"sample"`` and the value is > 1.
        dataset_path: "/"-separated dataset directory, relative to this file.
        eval_method: ``"thoroughness"`` selects ``compute_thoroughness``; any
            other value selects ``compute``.

    Raises:
        ValueError: If ``use_chat_prompt`` is combined with the ``"multiple"``
            or ``"cot"`` generation mode.
    """
    # Fail fast on unsupported combinations, before any file or model setup.
    if use_chat_prompt and generation_mode == "multiple":
        raise ValueError("Multiple generation mode is not supported with chat prompt")
    if use_chat_prompt and generation_mode == "cot":
        raise ValueError("COT generation mode is not supported with chat prompt")

    path = os.path.dirname(os.path.abspath(__file__))
    # split dataset_path by / and rebuild it with the OS path separator
    dataset_path = os.path.join(*dataset_path.split("/"))
    path = os.path.join(path, dataset_path)
    data_reader = XMLDataReader(path)
    n = num_responses if generation_mode == "sample" and num_responses > 1 else 1
    parsed_dialogues = data_reader.get_parsed_dialogues()
    # The key file path is relative to the current working directory.
    with open(".streamlit/oai_key.txt", "r", encoding="utf-8") as key_file:
        api_key = key_file.read().strip()

    generation_args = _build_generation_args(generation_mode, n)
    steering_prompt = _select_steering_prompt(generation_mode)
    chatgpt = ChatGPTModel(
        api_key=api_key,
        generation_args=generation_args,
        steering_prompt=steering_prompt,
    )
    gpt4 = ChatGPTModel(
        model_name="gpt-4",
        api_key=api_key,
        generation_args=generation_args,
        steering_prompt=steering_prompt,
    )

    # make sure the results folders exist
    os.makedirs("results/gpt4", exist_ok=True)
    os.makedirs("results/chatgpt", exist_ok=True)
    if use_chat_prompt:
        prompt_kind = "chat_prompt"
    elif generation_mode == "multiple" or generation_mode == "cot":
        prompt_kind = "comprehensive_prompt"
    else:
        prompt_kind = "instruction_prompt"
    results_file = f"{prompt_kind}_{generation_mode}_{n}_responses.xlsx"
    chatgpt_path = f"results/chatgpt/{results_file}"
    gpt4_path = f"results/gpt4/{results_file}"
    chatgpt_metrics = MetricComputer(export_to_excel=True, export_path=chatgpt_path)
    gpt4_metrics = MetricComputer(export_to_excel=True, export_path=gpt4_path)

    chatgpt_responses = []
    gpt4_responses = []
    dataset_references = []
    prompts = []
    # The instruction depends only on the generation mode, so build it once.
    instruction = _select_instruction(generation_mode)

    for parsed_dialogue_object in tqdm(parsed_dialogues, desc="Processing dialogues"):
        dialogue = parsed_dialogue_object["dialogue"]
        context_text = parsed_dialogue_object["context"]["context_text"]
        for i, turn in enumerate(dialogue):
            prompt = context_text + "\n\n"
            # dialogue is a list of dictionaries, one per turn, with the keys:
            # user_text: the text the user said
            # code_text: the code if any the user has written in this turn
            # cumulative_code_text: the most current code if any the user has written up to this point
            # assistant_text: the text the assistant said
            # user_alternatives: [{alt_text: the alternatives the user could have said, alt_code: any code the user could have written}]
            # assistant_alternatives: the alternatives the assistant could have said.
            # stop at the first turn that has no assistant text
            if turn["assistant_text"] == "":
                break
            if use_chat_prompt:
                # Build the chat history as (speaker, text) tuples, starting
                # with the context text and the first user turn.
                messages = [("User", context_text + "\n\n" + dialogue[0]["user_text"])]
                # NOTE(review): the history starts at turn 1, so turn 0's
                # assistant reply (and turn 0's code) never reaches the model
                # when i > 0 -- confirm whether that is intended.
                for j in range(1, i):
                    prev_turn = dialogue[j]
                    messages.append(_user_message(prev_turn))
                    messages.append(("Assistant", prev_turn["assistant_text"]))
                    # NOTE(review): the user's code is echoed as an Assistant
                    # message here; kept as-is, confirm it is intended.
                    if prev_turn["code_text"] != "":
                        messages.append(("Assistant", prev_turn["code_text"]))
                    prompt += _previous_turn_transcript(prev_turn)
                # add the current turn (turn 0 is already the first message)
                if i > 0:
                    messages.append(_user_message(turn))
                # finish the saved prompt with the current turn
                prompt += _current_turn_transcript(turn)
                chatgpt_response = chatgpt.generate_turn(
                    messages, user_identifier="User", system_identifier="Assistant"
                )
                gpt4_response = gpt4.generate_turn(
                    messages, user_identifier="User", system_identifier="Assistant"
                )
                prompts.append(prompt)
            else:
                # Instruction prompt mode.
                if i == 0 and generation_mode == "cot":
                    # First turn: introspect on the context alone.
                    introspection_prompt = (
                        prompt + f"{instruction['first_introspection']}\n"
                    )
                    introspection_chatgpt = chatgpt.generate(introspection_prompt)
                    introspection_gpt4 = gpt4.generate(introspection_prompt)
                # add all previous turns up to this point to the prompt
                for j in range(i):
                    if generation_mode == "cot" and j == 0:
                        prompt += "\nDialogue:\n"
                    prompt += _previous_turn_transcript(dialogue[j])
                prompt += _current_turn_transcript(turn)
                if generation_mode == "cot":
                    if i > 0:
                        # Later turns: re-introspect given the dialogue so far.
                        # (Turn 0 already has its first introspection above and
                        # must not be overwritten.)
                        introspection_prompt = (
                            prompt + f"{instruction['introspection']}\n"
                        )
                        introspection_chatgpt = chatgpt.generate(introspection_prompt)
                        introspection_gpt4 = gpt4.generate(introspection_prompt)
                    gpt4_prompt = (
                        prompt
                        + f"\nReasons or Misconceptions so far:\n{introspection_gpt4}\n{instruction['respond']}"
                    )
                    chatgpt_prompt = (
                        prompt
                        + f"\nReasons or Misconceptions so far:\n{introspection_chatgpt}\n{instruction['respond']}"
                    )
                    # The saved prompt records both models' introspections.
                    save_prompt = (
                        prompt
                        + f"\nReasons or Misconceptions so far:\n\tchatgpt:\n{introspection_chatgpt}\n\tgpt4:\n{introspection_gpt4}\n{instruction['respond']}"
                    )
                    chatgpt_response = chatgpt.generate(chatgpt_prompt)
                    gpt4_response = gpt4.generate(gpt4_prompt)
                    prompts.append(save_prompt)
                else:
                    prompt += f"\n{instruction}"
                    chatgpt_response = chatgpt.generate(prompt)
                    gpt4_response = gpt4.generate(prompt)
                    prompts.append(prompt)

            references = [turn["assistant_text"]] + turn["assistant_alternatives"]
            dataset_references.append(references)
            if generation_mode == "multiple" or generation_mode == "cot":
                # parse the itemized list string response into a list of responses
                chatgpt_response = _parse_itemized_response(chatgpt_response)
                gpt4_response = _parse_itemized_response(gpt4_response)
            # Note: if generation mode is multiple or sample with n > 1, then we have multiple responses per sample.
            chatgpt_responses.append(chatgpt_response)
            gpt4_responses.append(gpt4_response)

    # compute metrics for all generation modes.
    if eval_method != "thoroughness":
        chat_gpt_scores = chatgpt_metrics.compute(
            chatgpt_responses, dataset_references, contexts=prompts
        )
        gpt4_scores = gpt4_metrics.compute(
            gpt4_responses, dataset_references, contexts=prompts
        )
    else:
        chat_gpt_scores = chatgpt_metrics.compute_thoroughness(
            chatgpt_responses, dataset_references, contexts=prompts
        )
        gpt4_scores = gpt4_metrics.compute_thoroughness(
            gpt4_responses, dataset_references, contexts=prompts
        )
    print("ChatGPT scores:")
    print(chat_gpt_scores)
    print("GPT-4 scores:")
    print(gpt4_scores)
if __name__ == "__main__":
    # Command-line entry point: parse the options and dispatch to debug() or main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true", help="Run debug mode")
    parser.add_argument(
        "--do_chat_prompt",
        action="store_true",
        help="Run chat prompt mode where the prompt is the chat history between the user and chatgpt. Defaults to an instructional prompt 'Respond to the user with a Socratic utterance that guides the user to discover and fix the bug described in `<bug_desc>`. Student code is denoted by `<code>` throughout the conversation. Utterances that have the same meaning but different words are considered duplicates. Assume that the student has run the test cases.'",
    )
    # Generation mode. Restricted to the documented values so that a typo is
    # rejected by argparse instead of silently running as single-response mode.
    parser.add_argument(
        "--generation_mode",
        type=str,
        default="single",
        choices=["single", "multiple", "cot", "sample"],
        help="Generate a single response or multiple responses using instruction or sample k responses. Options are 'single', 'multiple', 'cot', or 'sample'. Defaults to 'single'. 'multiple' is incompatible with '--do_chat_prompt'. If 'sample', the number of responses to sample is specified by the --num_responses argument.",
    )
    parser.add_argument(
        "--num_responses",
        type=int,
        default=5,
        help="Number of responses to sample if --generation_mode is 'sample'. Defaults to 5.",
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="reviewing_dialogues/eval_full_v2",
        help="Path to the dataset to evaluate. Defaults to 'reviewing_dialogues/eval_full_v2'.",
    )
    # Evaluation method. Restricted to the documented values so that a typo is
    # rejected by argparse instead of silently selecting the 'overall' scoring.
    parser.add_argument(
        "--evaluation_method",
        type=str,
        default="thoroughness",
        choices=["overall", "thoroughness"],
        help="Evaluation method. Either 'overall' or 'thoroughness'. Defaults to 'thoroughness'. Thoroughness uses the bipartite matching algorithm to find the best match between each reference and the response. Overall uses scoring of the best matching (reference, pred) pair.",
    )
    args = parser.parse_args()
    if args.debug:
        debug()
    else:
        main(
            args.do_chat_prompt,
            args.generation_mode,
            args.num_responses,
            args.dataset_path,
            args.evaluation_method,
        )
# debug()
# run with:
# instruction mode with a single response
# python run_socratic_benchmark_metrics.py (done on desktop)
# single response without instruction mode
# python run_socratic_benchmark_metrics.py --do_chat_prompt (ready to test)
# multiple responses (all possible socratic utterances) with instruction mode
# python run_socratic_benchmark_metrics.py --generation_mode multiple (ready to test)
# sample 5 responses with instruction mode
# python run_socratic_benchmark_metrics.py --generation_mode sample --num_responses 5 (ready to test)
# sample 10 responses with instruction mode
# python run_socratic_benchmark_metrics.py --generation_mode sample --num_responses 10
# sample 5 responses without instruction mode
# python run_socratic_benchmark_metrics.py --do_chat_prompt --generation_mode sample --num_responses 5
# sample 10 responses without instruction mode
# python run_socratic_benchmark_metrics.py --do_chat_prompt --generation_mode sample --num_responses 10