Generalize eval arch

RubricLab · Oct 9, 2024 · 1e3a72a · 1e3a72a
1 parent e2b4b91
commit 1e3a72a
Show file tree

Hide file tree

Showing 7 changed files with 115 additions and 91 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,4 @@
+- [2024-10-09] [Generalize eval arch](https://github.com/RubricLab/memory/commit/bf80487850e840525a1521925a439d7d9fc8d638)
 - [2024-10-09] [Add help cmd](https://github.com/RubricLab/memory/commit/9f35d0016dcd5d0f909cb77c2ea33ef70da60fb1)
 - [2024-10-09] [Scaffold evals](https://github.com/RubricLab/memory/commit/3801514a795881c74ea225d02eeae001a07ee57a)
 - [2024-10-04] [bleed for mono](https://github.com/RubricLab/memory/commit/6db7d39072c60714068bcb00b07bbf917d76b4b8)

diff --git a/evals/index.ts b/evals/index.ts
@@ -1,92 +1,6 @@
 import { parseArgs } from 'node:util'
-import { openai } from '@ai-sdk/openai'
-import { generateObject } from 'ai'
-import chalk from 'chalk'
-import { z } from 'zod'
-import { clean, format } from '../utils/string.ts'
-import { EXAMPLES } from './extractions.ts'
-
-export const main = async ({ fast }: { fast?: boolean }) => {
-	let totalExamples = 0
-	let totalRecall = 0
-	let totalEntities = 0
-
-	for await (const eg of EXAMPLES) {
-		let correctEntities = 0
-		let correctFacts = 0
-
-		totalExamples += eg.facts.length
-
-		const {
-			object: { facts: attempts }
-		} = await generateObject({
-			model: openai(fast ? 'gpt-4o-mini' : 'gpt-4o-2024-08-06'),
-			schema: z.object({
-				facts: z.array(
-					z.object({
-						subject: z.string(),
-						relation: z.string().describe('a verb phrase'),
-						object: z.string(),
-						data: z.record(z.string(), z.string()).optional().describe('to capture any additional info')
-					})
-				)
-			}),
-			messages: [
-				{
-					role: 'system',
-					content: clean`Please extract all probable and implicit facts from the following passage.
-            Portray the first-person as "user".
-            Capture new relationships.
-            Try to capture the most up-to-date state of affairs in present tense.
-            Passage:
-            "${eg.content}"`
-				},
-				{
-					role: 'user',
-					content: eg.content
-				}
-			]
-		})
-
-		const omitted: number[] = []
-
-		for (const fact of eg.facts) {
-			console.log(
-				`\nTarget: ${chalk.magenta(fact.subject)} ${chalk.yellow(fact.relation)} ${chalk.blue(fact.object)}`
-			)
-
-			for (const [index, attempt] of attempts.entries()) {
-				const { subject, relation, object } = attempt
-
-				const subjectMatch = fact.subject === subject
-				const relationMatch = fact.relation === relation
-				const objectMatch = fact.object === object
-
-				console.log(
-					`${index + 1} of ${attempts.length}: ${chalk.magenta(format(subject, subjectMatch))} ${chalk.yellow(
-						format(relation, relationMatch)
-					)} ${chalk.blue(format(object, objectMatch))}`
-				)
-
-				if (omitted.includes(index)) continue
-
-				correctEntities = Number(subjectMatch) + Number(relationMatch) + Number(objectMatch)
-				correctFacts += Number(subjectMatch && relationMatch && objectMatch)
-
-				if (correctEntities === 3) {
-					omitted.push(index)
-					break
-				}
-			}
-		}
-
-		totalRecall += correctFacts
-		totalEntities += correctEntities
-	}
-
-	console.log(`\nPrecision: ${chalk.green(`${~~((totalEntities / (totalExamples * 3)) * 100)}%`)}`)
-	console.log(`Recall: ${chalk.green(`${~~((totalRecall / totalExamples) * 100)}%`)}`)
-}
+import { runOneShotExamples } from './one-shot'
+import { runMultiTurnExamples } from './multi-turn'
 
 const args = parseArgs({
 	args: Bun.argv,
@@ -115,5 +29,6 @@ if (import.meta.path === Bun.main) {
 		process.exit(0)
 	}
 
-	main({ fast: args.values.fast })
+	await runOneShotExamples({ fast: args.values.fast })
+	await runMultiTurnExamples({ fast: args.values.fast })
 }
diff --git a/evals/multi-turn/examples.ts b/evals/multi-turn/examples.ts
@@ -0,0 +1 @@
+export const EXAMPLES = [] // Placeholder: no multi-turn examples are defined yet.
diff --git a/evals/multi-turn/index.ts b/evals/multi-turn/index.ts
@@ -0,0 +1,11 @@
+import { EXAMPLES } from './examples'
+
+/**
+ * Logs each entry of the multi-turn `EXAMPLES` list to the console, one per iteration.
+ * The `fast` option is accepted but not read inside this function.
+ */
+export const runMultiTurnExamples = async ({ fast }: { fast?: boolean }) => {
+	for await (const example of EXAMPLES) {
+		console.log(example)
+	}
+}
diff --git a/evals/extractions.ts → evals/one-shot/examples.ts b/evals/extractions.ts → evals/one-shot/examples.ts
@@ -48,7 +48,7 @@ export const EXAMPLES: Example[] = [
 		]
 	},
 	{
-		content: 'I am vegan... (2 hours later)... I am no longer vegan.',
+		content: 'I am vegan... (2 hours later)... I am not vegan.',
 		facts: [
 			{
 				subject: 'user',

diff --git a/evals/one-shot/index.ts b/evals/one-shot/index.ts
@@ -0,0 +1,96 @@
+import { openai } from '@ai-sdk/openai'
+import { generateObject } from 'ai'
+import chalk from 'chalk'
+import { z } from 'zod'
+import { clean, format } from '../../utils/string.ts'
+import { EXAMPLES } from './examples.ts'
+
+/**
+ * Runs every one-shot extraction example through the model and prints precision and recall.
+ *
+ * Each expected fact is compared against the model's attempts; an attempt that matches on
+ * subject, relation and object is consumed so it cannot satisfy a second expected fact.
+ *
+ * @param fast - when truthy, uses 'gpt-4o-mini' instead of 'gpt-4o-2024-08-06'.
+ */
+export const runOneShotExamples = async ({ fast }: { fast?: boolean }) => {
+	let totalFacts = 0
+	let totalRecall = 0
+	let totalAttempts = 0
+
+	for await (const eg of EXAMPLES) {
+		let correctFacts = 0
+
+		totalFacts += eg.facts.length
+
+		console.log(chalk.yellow(`\n\n"${eg.content}"`))
+
+		const {
+			object: { facts: attempts }
+		} = await generateObject({
+			model: openai(fast ? 'gpt-4o-mini' : 'gpt-4o-2024-08-06'),
+			schema: z.object({
+				facts: z.array(
+					z.object({
+						subject: z.string(),
+						relation: z.string().describe('a verb phrase'),
+						object: z.string(),
+						data: z.record(z.string(), z.string()).optional().describe('to capture any additional info')
+					})
+				)
+			}),
+			// Instructions and passage are sent as one `prompt` string, not as system/user `messages`.
+			prompt: clean`Please extract all probable and implicit facts from the following passage.
+            Portray the first-person as "user".
+            Capture new relationships.
+            Try to capture the most up-to-date state of affairs in present tense.
+            Passage:
+            "${eg.content}"
+            `
+		})
+
+		// Indices of attempts already credited to an expected fact.
+		const omitted = new Set<number>()
+
+		for (const [i, fact] of eg.facts.entries()) {
+			console.log(
+				`\n🎯 ${i + 1} of ${eg.facts.length}: ${chalk.magenta(fact.subject)} ${chalk.yellow(fact.relation)} ${chalk.blue(fact.object)}`
+			)
+
+			for (const [j, attempt] of attempts.entries()) {
+				if (omitted.has(j)) continue
+
+				const { subject, relation, object } = attempt
+
+				const correctSubject = fact.subject === subject
+				const correctRelation = fact.relation === relation
+				const correctObject = fact.object === object
+
+				console.log(
+					`🤖 ${j + 1} of ${attempts.length}: ${chalk.magenta(format(subject, correctSubject))} ${chalk.yellow(
+						format(relation, correctRelation)
+					)} ${chalk.blue(format(object, correctObject))}`
+				)
+
+				// Decide on THIS comparison only: `correctFacts` is a running count, so testing it
+				// would consume the first remaining attempt for every fact after the first match.
+				if (correctSubject && correctRelation && correctObject) {
+					correctFacts += 1
+					omitted.add(j)
+					break
+				}
+			}
+		}
+
+		totalRecall += correctFacts
+		totalAttempts += attempts.length
+	}
+
+	// `~~` truncates to an integer and turns NaN (0 / 0 when nothing was evaluated) into 0.
+	console.log(
+		`\n\nPrecision (% of attempts true): ${totalRecall} of ${totalAttempts} ${chalk.green(`(${~~((totalRecall / totalAttempts) * 100)}%)`)}`
+	)
+	console.log(
+		`Recall (% of total facts correctly returned): ${totalRecall} of ${totalFacts} ${chalk.green(`(${~~((totalRecall / totalFacts) * 100)}%)`)}`
+	)
+}
diff --git a/package.json b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@rubriclab/memory",
   "module": "index.ts",
-  "version": "0.0.5",
+  "version": "0.0.6",
   "private": false,
   "type": "module",
   "devDependencies": {