From d71a86c04f5a25565f864bfeb892f1d1a139c6ce Mon Sep 17 00:00:00 2001
From: tedspare <ted.spare@gmail.com>
Date: Wed, 9 Oct 2024 15:12:47 -0400
Subject: [PATCH] Scaffold multi-turn evals

---
 CHANGELOG.md                     |  1 +
 package.json                     |  2 +-
 src/evals/index.ts               | 17 +++++--
 src/evals/multi-turn/examples.ts | 33 +++++++++++-
 src/evals/multi-turn/index.ts    | 86 +++++++++++++++++++++++++++++++-
 src/evals/one-shot/index.ts      |  6 +--
 tsconfig.json                    |  1 +
 7 files changed, 135 insertions(+), 11 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 310301a..9694c74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,4 @@
+- [2024-10-09] [Scaffold multi-turn evals](https://github.com/RubricLab/memory/commit/ecb5531acef6a924b81684660150eeb71d93e704)
 - [2024-10-09] [Add TSConfig](https://github.com/RubricLab/memory/commit/ed521824cc492e46adff6d38a994e18cc08166b2)
 - [2024-10-09] [Extract memory to class](https://github.com/RubricLab/memory/commit/5e165608ffad822c5b77ee03f1dfc308dcb1787a)
 - [2024-10-09] [Fix precision calc](https://github.com/RubricLab/memory/commit/52fc41e151c47e276c37a24b3489ba414d032a0b)
diff --git a/package.json b/package.json
index 0760c27..cf69fe2 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@rubriclab/memory",
   "module": "src/index.ts",
-  "version": "0.0.9",
+  "version": "0.0.10",
   "private": false,
   "type": "module",
   "devDependencies": {
diff --git a/src/evals/index.ts b/src/evals/index.ts
index 2f705a1..19f7b9a 100644
--- a/src/evals/index.ts
+++ b/src/evals/index.ts
@@ -9,6 +9,11 @@ const args = parseArgs({
 			type: 'boolean',
 			default: false
 		},
+		dataset: {
+			type: 'string',
+			default: '1',
+			choices: ['1', '2']
+		},
 		help: {
 			type: 'boolean',
 			default: false
@@ -18,7 +23,8 @@ const args = parseArgs({
 })
 
 if (import.meta.path === Bun.main) {
-	if (args.values.help) {
+	const { help, fast, dataset } = args.values
+	if (help) {
 		console.log(`
   Usage: bun evals/index.ts [options]
 
@@ -29,8 +35,11 @@ if (import.meta.path === Bun.main) {
 		process.exit(0)
 	}
 
-	const model = args.values.fast ? 'gpt-4o-mini' : 'gpt-4o-2024-08-06'
+	const model = fast ? 'gpt-4o-mini' : 'gpt-4o-2024-08-06'
 
-	await runOneShotExamples({ model })
-	await runMultiTurnExamples({ model })
+	// Nothing visible here rejects other `--dataset` values (NOTE(review): assuming `parseArgs` is Node's
+	// `util.parseArgs`, which has no `choices` option), so fail loudly instead of silently running nothing.
+	if (dataset === '1') await runOneShotExamples({ model })
+	else if (dataset === '2') await runMultiTurnExamples({ model })
+	else throw new Error(`Unknown dataset "${dataset}"; expected "1" or "2"`)
 }
diff --git a/src/evals/multi-turn/examples.ts b/src/evals/multi-turn/examples.ts
index e8fead5..ce0f358 100644
--- a/src/evals/multi-turn/examples.ts
+++ b/src/evals/multi-turn/examples.ts
@@ -1 +1,32 @@
-export const EXAMPLES = []
+import type { Fact } from '@/types'
+
+/** One conversation: an ordered list of messages, each paired with the facts the eval compares against. */
+type Turn = { content: string; facts: Fact[] }
+type Example = { messages: Turn[] }
+
+/**
+ * Multi-turn fixtures.
+ * Each entry is one conversation, and each message lists the facts that the
+ * eval runner compares against after that message has been processed.
+ */
+export const EXAMPLES: Example[] = [
+	{
+		messages: [
+			{
+				// Opening turn: states a single fact about the user.
+				content: 'I am vegan',
+				facts: [
+					{ subject: 'user', relation: 'is', object: 'vegan' }
+				]
+			},
+			{
+				// Follow-up turn: same subject and object as the opening turn,
+				// but with a different relation.
+				content: 'I am not vegan',
+				facts: [
+					{ subject: 'user', relation: 'is not', object: 'vegan' }
+				]
+			}
+		]
+	}
+]
diff --git a/src/evals/multi-turn/index.ts b/src/evals/multi-turn/index.ts
index f5d9cda..2bf6b79 100644
--- a/src/evals/multi-turn/index.ts
+++ b/src/evals/multi-turn/index.ts
@@ -1,11 +1,93 @@
+import { Database } from 'bun:sqlite'
+import { Memory } from '@/index'
+import type { Fact } from '@/types'
+import { format } from '@/utils/string'
 import type { openai } from '@ai-sdk/openai'
-import { Memory } from '../..'
+import chalk from 'chalk'
 import { EXAMPLES } from './examples'
 
+// In-memory SQLite store for extracted facts; (subject, object) is the table's primary key.
+const db = new Database(':memory:', { create: true, strict: true })
+
+// DDL yields no rows and bun:sqlite calls are synchronous, so run it directly (no `await`, no `.get()`).
+db.run(
+	'create table if not exists facts (subject text, relation text, object text, primary key (subject, object))'
+)
+
 export const runMultiTurnExamples = async ({ model }: { model: Parameters<typeof openai>[0] }) => {
 	const memory = new Memory({ model })
 
+	let totalFacts = 0
+	let totalRecall = 0
+	let totalAttempts = 0
+
 	for await (const eg of EXAMPLES) {
-		console.log(eg)
+		for await (const message of eg.messages) {
+			totalFacts += message.facts.length
+
+			console.log(chalk.yellow(`\n\n"${message.content}"`))
+
+			const { facts: attempts } = await memory.extract({
+				content: message.content
+			})
+
+			const omitted: number[] = []
+
+			for (const [i, fact] of message.facts.entries()) {
+				let correctFacts = 0
+
+				console.log(
+					`\n🎯 ${i + 1} of ${message.facts.length}: ${chalk.magenta(fact.subject)} ${chalk.yellow(fact.relation)} ${chalk.blue(fact.object)}`
+				)
+
+				for (const attempt of attempts) {
+					const { subject, relation, object } = attempt
+
+					db
+						.prepare(`
+							insert into facts (subject, relation, object)
+							values ($1, $2, $3)
+							on conflict (subject, object) do update set relation = $2
+						`)
+						.run(subject, relation, object)
+				}
+
+				const newFacts = db.query('select * from facts').all()
+				console.log({ newFacts })
+
+				for (const [k, newFact] of newFacts.entries()) {
+					const { subject, relation, object } = newFact as Fact
+
+					const correctSubject = fact.subject === subject
+					const correctRelation = fact.relation === relation
+					const correctObject = fact.object === object
+
+					console.log(
+						`🤖 ${k + 1} of ${newFacts.length}: ${chalk.magenta(format(subject, correctSubject))} ${chalk.yellow(
+							format(relation, correctRelation)
+						)} ${chalk.blue(format(object, correctObject))}`
+					)
+
+					if (omitted.includes(k)) continue
+
+					correctFacts += Number(correctSubject && correctRelation && correctObject)
+
+					if (correctFacts) {
+						omitted.push(k)
+						break
+					}
+				}
+				totalRecall += correctFacts
+			}
+
+			totalAttempts += attempts.length
+		}
 	}
+
+	console.log(
+		`\n\nPrecision (% of attempts true): ${totalRecall} of ${totalAttempts} ${chalk.green(`(${~~((totalRecall / totalAttempts) * 100)}%)`)}`
+	)
+	console.log(
+		`Recall (% of total facts correctly returned): ${totalRecall} of ${totalFacts} ${chalk.green(`(${~~((totalRecall / totalFacts) * 100)}%)`)}`
+	)
 }
diff --git a/src/evals/one-shot/index.ts b/src/evals/one-shot/index.ts
index 28d7249..3ade20c 100644
--- a/src/evals/one-shot/index.ts
+++ b/src/evals/one-shot/index.ts
@@ -1,16 +1,16 @@
-import { Memory } from '@/'
+import { Memory } from '@/index'
 import { format } from '@/utils/string'
 import type { openai } from '@ai-sdk/openai'
 import chalk from 'chalk'
 import { EXAMPLES } from './examples'
 
 export const runOneShotExamples = async ({ model }: { model: Parameters<typeof openai>[0] }) => {
+	const memory = new Memory({ model })
+
 	let totalFacts = 0
 	let totalRecall = 0
 	let totalAttempts = 0
 
-	const memory = new Memory({ model })
-
 	for await (const eg of EXAMPLES) {
 		totalFacts += eg.facts.length
 
diff --git a/tsconfig.json b/tsconfig.json
index 1330331..5d4cfb7 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -1,5 +1,6 @@
 {
 	"compilerOptions": {
+		"target": "ESNext",
 		"baseUrl": ".",
 		"paths": {
 			"@/*": ["./src/*"]