Feature/fig and tables (#33)
* remove mac run

* add figure generator

* add more models

* update figs and tables
XuhuiZhou authored Sep 2, 2024
1 parent bb3e229 commit a457a08
Showing 12 changed files with 1,009 additions and 414 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -20,7 +20,7 @@ jobs:
    strategy:
      max-parallel: 5
      matrix:
-        os: [ubuntu-latest, macos-13]
+        os: [ubuntu-latest]

    runs-on: ${{ matrix.os }}

2 changes: 2 additions & 0 deletions .gitignore
@@ -175,3 +175,5 @@ data/*

# Ignore all .csv files
*.csv

*.DS_Store
3 changes: 3 additions & 0 deletions README.md
@@ -55,6 +55,9 @@ Concretely, here is an example of uploading profiles to the database:
```bash
python examples/create_env_agent_combo.py --agent-folder="./assets/ai_agent_profiles" --env-folders="./assets/education,./assets/healthcare,./assets/personal_services,./assets/miscellaneous,./assets/technology_and_science,./assets/business_and_finance,./assets/politics_and_law" --clean-combos
```

To learn more about the command line arguments, you can use the following command:
```bash
python examples/create_env_agent_combo.py --help
```
16 changes: 14 additions & 2 deletions docs/experiment_log.md
@@ -4,11 +4,23 @@
- **Date**: 2024-08-22
- **Experiment**: Run trial simulation on all the newly added environments once.
```bash
python examples/experiment.py --models="gpt-4-turbo" --partner-model="gpt-4o-2024-08-06" --evaluator-model="gpt-4o-2024-08-06" --batch-size=4 --task="haicosystem_trial2" --push-to-db --iteration-num=1 --only-show-performance > experiment_output.log 2>&1
python examples/experiment.py --models="gpt-4-turbo" --partner-model="gpt-4o-2024-08-06" --evaluator-model="gpt-4o-2024-08-06" --batch-size=4 --task="haicosystem_trial2" --push-to-db --iteration-num=1 > experiment_output.log 2>&1
```
Note: `gpt-4-turbo` here resolves to `gpt-4-turbo-2024-04-09`.

```bash
python examples/experiment.py --models="gpt-3.5-turbo" --partner-model="gpt-4o-2024-08-06" --evaluator-model="gpt-4o-2024-08-06" --batch-size=4 --task="haicosystem_trial2" --push-to-db --iteration-num=1 --only-show-performance
python examples/experiment.py --models="gpt-3.5-turbo" --partner-model="gpt-4o-2024-08-06" --evaluator-model="gpt-4o-2024-08-06" --batch-size=4 --task="haicosystem_trial2" --push-to-db --iteration-num=1
```

Note: `gpt-3.5-turbo` here resolves to `gpt-3.5-turbo-0613`.

```bash
python examples/experiment.py --models="together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo" --partner-model="gpt-4o-2024-08-06" --evaluator-model="gpt-4o-2024-08-06" --batch-size=4 --task="haicosystem_trial2" --push-to-db --iteration-num=1
```

```bash
python examples/experiment.py --models="together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo" --partner-model="gpt-4o-2024-08-06" --evaluator-model="gpt-4o-2024-08-06" --batch-size=4 --task="haicosystem_trial2" --push-to-db --iteration-num=1
```

- **Notes**: a. The LLM engine seems to struggle to generate well-formed outputs and constantly needs reformatting.
- **Results**: The model was overfitting after 20 epochs.
97 changes: 3 additions & 94 deletions examples/experiment.py
@@ -1,11 +1,9 @@
import asyncio
import json
import logging
import math
import subprocess
from collections import defaultdict
from datetime import datetime
from itertools import chain
from logging import FileHandler
from pathlib import Path
from typing import OrderedDict, cast
@@ -15,14 +13,11 @@
import rich
import typer
from rich.logging import RichHandler
from sotopia.agents import LLMAgent
from sotopia.database import (
    AgentProfile,
    EnvAgentComboStorage,
    EpisodeLog,
)
from sotopia.database.persistent_profile import EnvironmentList
from sotopia.database.serialization import get_rewards_from_episode
from sotopia.envs.evaluators import (
    RuleBasedTerminatedEvaluator,
)
@@ -39,6 +34,7 @@
from haicosystem.grounding_engine import LLMGroundingEngine
from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.server import run_async_server
from haicosystem.utils import get_avg_reward
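# A usage sketch, not taken from the diff: judging from the removed in-file
# definition further down, get_avg_reward maps each evaluation dimension to a
# (avg_reward, margin_of_error at 95% confidence) pair, and with binary=True it
# averages 0/1 indicators of negative rewards, which presumably backs the
# "Ratio of episodes with positive risks" printout in benchmark_display, e.g.
#     avg_rewards = get_avg_reward(episodes, model)  # hypothetical call site
#     binary_avg_rewards = get_avg_reward(episodes, model, binary=True)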

app = typer.Typer(pretty_exceptions_show_locals=False)

@@ -182,6 +178,7 @@ def benchmark_display(
    model_rewards_dict = dict()
    for model in model_list:
        tag = f"benchmark_{model}_{partner_model}_{evaluator_model}_{task}"
        rich.print(f"[bold purple]{tag}[/bold purple]")
        episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
        if len(episodes) == 0:
            print(f"No episodes found for {model}")
@@ -193,7 +190,7 @@ def benchmark_display(
# print(f"Model: {model}, episodes: {len(episodes)}, Avg Rewards: {avg_rewards}")
rich.print(model_rewards_dict)
rich.print("Ratio of episodes with positive risks")
rich.print(binary_avg_rewards)
rich.print(binary_model_rewards_dict)

display_in_table(model_rewards_dict)

@@ -203,94 +200,6 @@ def benchmark_display(
save_to_jsonl(model_rewards_dict, partner_model)


def get_avg_reward(
    episodes: list[EpisodeLog], model_name: str, binary: bool = False
) -> dict[str, tuple[float, float]]:
    """
    return: dictionary of {dimension: (avg_reward, margin_of_error (in 95% confidence interval))}, plus the distinct setting number and episode count (in the same format, but with 0 margin of error)
    """
    rewards_dict = defaultdict(
        list
    )  # {pk: [rewards]}, {pk}_{i} denotes the i-th agent is the test agent
    avg_reward_dict = {}
    avg_margin_dict = {}
    avg_variance_dict = {}
    for episode in episodes:
        assert episode.models is not None, "episode.models should not be None"
        if episode.models[1] == model_name:
            reward = get_rewards_from_episode(episode)[0][1]
            if binary:
                reward = {key: 1 if value < 0 else 0 for key, value in reward.items()}
            rewards_dict[f"{episode.environment}_0"].append(reward)
        else:
            reward = get_rewards_from_episode(episode)[1][1]
            if binary:
                reward = {key: 1 if value < 0 else 0 for key, value in reward.items()}
            rewards_dict[f"{episode.environment}_1"].append(reward)
    dimensions = list(rewards_dict.values())[0][0].keys()

    def calc_variance(local_rewards_list: list[dict[str, float]]) -> dict[str, float]:
        # get the variance within a list, discarded
        local_var_reward_dict = {}
        local_dimensions = local_rewards_list[0].keys()
        assert set(local_dimensions) == set(dimensions), "dimensions should be the same"
        for dimension in local_dimensions:
            rewards = [reward[dimension] for reward in local_rewards_list]
            avg_reward = sum(rewards) / len(rewards)
            if len(rewards) == 1:
                variance = 0.0
            else:
                variance = sum([(reward - avg_reward) ** 2 for reward in rewards]) / (
                    len(rewards) - 1
                )
            local_var_reward_dict[dimension] = variance

        return local_var_reward_dict

    def calc_average(list_to_average: list[float]) -> float:
        return sum(list_to_average) / len(list_to_average)

    rewards_list = list(chain(*rewards_dict.values()))

    variance_reward_list = [calc_variance(rewards) for rewards in rewards_dict.values()]
    for dimension in rewards_list[0].keys():
        avg_reward_dict[dimension] = calc_average(
            [reward[dimension] for reward in rewards_list]
        )
        avg_variance_dict[dimension] = calc_average(
            [variance[dimension] for variance in variance_reward_list]
        )  # average the variances for an estimation of the variance

    for dimension in rewards_list[0].keys():
        # calculate the margin of error by the averaged mean and variance
        avg_variance = avg_variance_dict[dimension]

        combined_variance = avg_variance
        combined_sem = math.sqrt(combined_variance / len(rewards_list))

        confidence_level = 0.95
        t_samples = np.random.standard_t(df=len(rewards_list), size=1000000)

        overall_t_value = np.percentile(
            t_samples, 100 * (1 - (1 - confidence_level) / 2)
        )

        margin = overall_t_value * combined_sem
        avg_margin_dict[dimension] = margin

    return_rewards_dict = {
        key: (avg_reward_dict[key], avg_margin_dict[key])
        for key in avg_reward_dict.keys()
    }
    return_rewards_dict = {
        **return_rewards_dict,
        "setting_num": (float(len(variance_reward_list)), 0.0),
        "episode_count": (float(len(rewards_list)), 0.0),
    }

    return return_rewards_dict
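
# Illustrative sketch of the value returned by get_avg_reward; the dimension
# names and numbers are hypothetical, only the structure follows the docstring.
example_avg_rewards: dict[str, tuple[float, float]] = {
    "targeted_safety_risks": (-1.8, 0.4),  # (avg_reward, 95% CI margin)
    "system_and_operational_risks": (-0.6, 0.3),
    "setting_num": (25.0, 0.0),  # number of distinct settings evaluated
    "episode_count": (100.0, 0.0),  # number of episodes aggregated
}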


def preprocess_episode_data(
    episode_list_with_tag: list[EpisodeLog],
) -> dict[tuple[str, tuple[str, ...], tuple[str, ...]], bool]:
36 changes: 0 additions & 36 deletions examples/notebooks/demo.ipynb

This file was deleted.
