sotopia-lab · ProKil · Oct 4, 2024 · Oct 6, 2024 · Oct 6, 2024 · Oct 6, 2024
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
@@ -29,14 +29,16 @@ jobs:
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
+
+    - name: Install Pyaudio
+      run: sudo apt-get install -y portaudio19-dev
     - name: Display Python version
       run: python -c "import sys; print(sys.version)"
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         python -m pip install uv
-        uv sync --extra test --extra chat
     - name: Type-checking package with mypy
       run: |
         # Run this mypy instance against our main package.
-        uv run mypy --strict .
+        uv run --all-extras mypy --strict .
diff --git a/docs/pages/contribution/contribution.md b/docs/pages/contribution/contribution.md
@@ -133,7 +133,7 @@ Please refer to [Dev Containers](https://containers.dev/supporting#editors) to s
 
 You can also set up the development environment without Dev Containers. There are three things you will need to set up manually:
 
-- Python and uv: Please start from an environment supporting Python 3.10+ and install uv using `pip install uv; uv sync --all-extra`.
+- Python and uv: Please start from an environment supporting Python 3.10+ and install uv using `pip install uv; uv sync --all-extras`.
 - Redis: Please refer to introduction page for the set up of Redis.
 - Local LLM (optional): If you don't have access to model endpoints (e.g. OpenAI, Anthropic or others), you can use a local model. You can use Ollama, Llama.cpp,  vLLM or many others which support OpenAI compatible endpoints.
 

diff --git a/examples/experimental/realtime/audio_mixer.py b/examples/experimental/realtime/audio_mixer.py
@@ -0,0 +1,101 @@
+from typing import AsyncIterator, Literal
+from aact import Message, Node, NodeFactory
+from aact.messages import Tick, Audio
+import numpy as np
+
+
+def merge_audio_streams(
+    streams: list[bytes], sample_width: Literal[1, 2, 4] = 2
+) -> bytes:
+    """Mix equally sized PCM byte streams into one by averaging their samples.
+
+    Args:
+        streams: Raw audio chunks, one per source. Every chunk must decode to
+            the same number of samples.
+        sample_width: Bytes per sample. The chunks are decoded with the numpy
+            type codes "B" (width 1), "h" (width 2) or "i" (width 4).
+
+    Returns:
+        The mixed chunk, encoded with the same sample type as the inputs.
+        ``b""`` when ``streams`` is empty.
+
+    Raises:
+        KeyError: If ``sample_width`` is not 1, 2 or 4.
+        ValueError: If the streams do not all contain the same number of
+            samples, or (from ``np.frombuffer``) if a chunk's byte length is
+            not a multiple of the sample size.
+    """
+    sample_dtype = np.dtype({1: "B", 2: "h", 4: "i"}[sample_width])
+    if not streams:
+        return b""
+
+    decoded = [np.frombuffer(stream, dtype=sample_dtype) for stream in streams]
+
+    # Validate explicitly instead of with `assert`: asserts vanish under -O, and
+    # a one-sample stream would then be silently broadcast over the others.
+    expected_length = len(decoded[0])
+    if any(len(samples) != expected_length for samples in decoded):
+        raise ValueError("All audio streams must contain the same number of samples")
+
+    # Accumulate in int64 so that summing several 32-bit streams cannot wrap
+    # around before the division brings the value back into range.
+    mixed = np.zeros(expected_length, dtype=np.int64)
+    for samples in decoded:
+        mixed += samples
+    mixed //= len(decoded)
+
+    # The average of in-range samples is already in range; the clip is kept as
+    # a safeguard before narrowing back to the sample type.
+    limits = np.iinfo(sample_dtype)
+    return np.clip(mixed, limits.min, limits.max).astype(sample_dtype).tobytes()
+
+
+@NodeFactory.register("audio_mixer")
+class AudioMixerNode(Node[Tick | Audio, Audio]):
+    """Node that mixes several audio channels into one chunk per tick.
+
+    Audio received on each input channel is queued per channel. Every message
+    on the tick channel emits one mixed chunk of exactly ``buffer_size`` bytes;
+    channels that have queued less than that are padded with zero bytes.
+    """
+
+    def __init__(
+        self,
+        input_channels: list[str],
+        tick_input_channel: str,
+        output_channel: str,
+        redis_url: str,
+        buffer_size: int = 1024,
+    ):
+        """Register the channels with the base node and create empty queues.
+
+        Args:
+            input_channels: Names of the audio channels to mix.
+            tick_input_channel: Channel whose messages trigger one output chunk.
+            output_channel: Channel the mixed audio is yielded on.
+            redis_url: Forwarded unchanged to the ``Node`` base class.
+            buffer_size: Size in bytes of every emitted chunk.
+
+        Raises:
+            ValueError: If ``buffer_size`` is not a positive, even number.
+        """
+        # Chunks are mixed with merge_audio_streams' default 2-byte sample
+        # width, so an odd size would fail on every tick, and a non-positive
+        # size would queue incoming audio forever. Fail fast instead.
+        if buffer_size <= 0 or buffer_size % 2:
+            raise ValueError(
+                f"buffer_size must be a positive even number, got {buffer_size}"
+            )
+        super().__init__(
+            input_channel_types=[(channel, Audio) for channel in input_channels]
+            + [(tick_input_channel, Tick)],
+            output_channel_types=[(output_channel, Audio)],
+            redis_url=redis_url,
+        )
+        self.input_channels = input_channels
+        self.tick_input_channel = tick_input_channel
+        self.output_channel = output_channel
+        # Audio that will be emitted on the next tick (at most buffer_size bytes).
+        self.buffers: dict[str, bytes] = {channel: b"" for channel in input_channels}
+        # Audio received after the matching buffer filled up; drained on later ticks.
+        self.overflow_buffers: dict[str, bytes] = {
+            channel: b"" for channel in input_channels
+        }
+        self.buffer_size = buffer_size
+
+    async def event_handler(
+        self, channel: str, message: Message[Tick | Audio]
+    ) -> AsyncIterator[tuple[str, Message[Audio]]]:
+        """Queue incoming audio, or emit one mixed chunk when a tick arrives.
+
+        Args:
+            channel: Name of the channel the message was received on.
+            message: The tick or audio message.
+
+        Yields:
+            ``(output_channel, mixed audio message)`` once per tick message.
+            Audio messages yield nothing.
+
+        Raises:
+            ValueError: If ``channel`` is neither the tick channel nor one of
+                the input channels.
+        """
+        if channel == self.tick_input_channel:
+            frames: list[bytes] = []
+            for audio_channel in self.input_channels:
+                # Pad with zero bytes so every stream is exactly buffer_size long.
+                frames.append(
+                    self.buffers[audio_channel].ljust(self.buffer_size, b"\x00")
+                )
+                # Refill the buffer for the next tick from the overflow queue.
+                backlog = self.overflow_buffers[audio_channel]
+                self.buffers[audio_channel] = backlog[: self.buffer_size]
+                self.overflow_buffers[audio_channel] = backlog[self.buffer_size :]
+            mixed = merge_audio_streams(frames)
+            yield self.output_channel, Message[Audio](data=Audio(audio=mixed))
+
+        elif channel in self.input_channels:
+            # Narrows Tick | Audio to Audio for the type checker.
+            assert isinstance(message.data, Audio)
+            chunk = message.data.audio
+            if len(self.buffers[channel]) == self.buffer_size:
+                # The next tick's buffer is already full: queue for later ticks.
+                self.overflow_buffers[channel] += chunk
+            else:
+                filled = self.buffers[channel] + chunk
+                if len(filled) >= self.buffer_size:
+                    # Spill everything past buffer_size into the overflow queue.
+                    self.overflow_buffers[channel] = filled[self.buffer_size :]
+                self.buffers[channel] = filled[: self.buffer_size]
+        else:
+            raise ValueError(f"Unexpected channel: {channel}")
diff --git a/examples/experimental/realtime/input_node.py b/examples/experimental/realtime/input_node.py
@@ -0,0 +1,35 @@
+import sys
+from typing import AsyncIterator
+
+from sotopia.agents.llm_agent import ainput
+
+# NOTE(review): both branches are `pass`, so this version check has no effect.
+# It looks like leftover scaffolding (presumably a conditional import that was
+# removed) — confirm, then delete it together with the then-unused `import sys`.
+if sys.version_info < (3, 11):
+    pass
+else:
+    pass
+from aact import Message, Node, NodeFactory
+from aact.messages import Text, Zero
+
+
+@NodeFactory.register("input")
+class InputNode(Node[Zero, Text]):
+    """Node that publishes every text obtained from ``ainput`` as a Text message."""
+
+    def __init__(self, output_channel: str, redis_url: str) -> None:
+        """Register ``output_channel`` as the only channel; no inputs are used."""
+        super().__init__(
+            redis_url=redis_url,
+            input_channel_types=[],
+            output_channel_types=[(output_channel, Text)],
+        )
+        self.output_channel = output_channel
+
+    async def event_loop(self) -> None:
+        """Prompt for text forever and publish each entry as a JSON message."""
+        while True:
+            entered = await ainput("Enter text: ")
+            payload = Message[Text](data=Text(text=entered)).model_dump_json()
+            await self.r.publish(self.output_channel, payload)
+
+    async def event_handler(
+        self, _: str, __: Message[Zero]
+    ) -> AsyncIterator[tuple[str, Message[Text]]]:
+        """Yield a single empty Text message on the output channel.
+
+        This node registers no input channels, so this is presumably never
+        invoked and only satisfies the ``Node`` interface.
+        """
+        empty = Message[Text](data=Text(text=""))
+        yield self.output_channel, empty
diff --git a/examples/experimental/realtime/readme.md b/examples/experimental/realtime/readme.md
@@ -0,0 +1,17 @@
+## Demo Realtime API
+
+You need `portaudio` installed to run this demo:
+
+```bash
+# On Mac
+brew install portaudio
+
+# On Linux
+apt-get install portaudio19-dev
+```
+
+Execute this command in the repo folder to run the example:
+
+```bash
+uv run --extra realtime aact run-dataflow examples/experimental/realtime/realtime_chat.toml
+```
diff --git a/examples/experimental/realtime/realtime_chat.toml b/examples/experimental/realtime/realtime_chat.toml
@@ -0,0 +1,74 @@
+redis_url = "redis://localhost:6379/0"
+extra_modules = [
+    "examples.experimental.realtime.realtime_websocket",
+    "examples.experimental.realtime.input_node",
+    "examples.experimental.realtime.audio_mixer"
+]
+
+[[nodes]]
+node_name = "speaker"
+node_class = "speaker"
+
+[nodes.node_args]
+rate = 24000
+input_channel = "Eve"
+
+[[nodes]]
+node_name = "speaker2"
+node_class = "speaker"
+
+[nodes.node_args]
+rate = 24000
+input_channel = "Jane"
+
+# [[nodes]]
+# node_name = "listener"
+# node_class = "listener"
+
+# [nodes.node_args]
+# rate = 24000
+# output_channel = "audio_input"
+
+
+[[nodes]]
+node_name = "Eve"
+node_class = "openai_realtime"
+
+[nodes.node_args]
+input_channel = "Jane_mixed"
+output_channel = "Eve"
+instruction = "Your name is Eve, you are talking to your friend Jane. You want to convince her to play poker with you tonight. Please start every sentence with \"Jane,\""
+
+
+[[nodes]]
+node_name = "Jane"
+node_class = "openai_realtime"
+
+[nodes.node_args]
+input_channel = "Eve_mixed"
+output_channel = "Jane"
+instruction = "Your name is Jane, you are talking to your friend Eve. You want to convince her to play soccer with you tonight. Please let her say the first sentence. Please start every sentence with \"Eve,\""
+
+[[nodes]]
+node_name = "audio_mixer_Jane"
+node_class = "audio_mixer"
+
+[nodes.node_args]
+input_channels = ["Jane"]
+tick_input_channel = "tick/millis/20"
+output_channel = "Jane_mixed"
+buffer_size = 960
+
+[[nodes]]
+node_name = "audio_mixer_Eve"
+node_class = "audio_mixer"
+
+[nodes.node_args]
+input_channels = ["Eve"]
+tick_input_channel = "tick/millis/20"
+output_channel = "Eve_mixed"
+buffer_size = 960
+
+[[nodes]]
+node_name = "tick"
+node_class = "tick"
diff --git a/examples/experimental/realtime/realtime_chat_human_in_the_loop.toml b/examples/experimental/realtime/realtime_chat_human_in_the_loop.toml
@@ -0,0 +1,74 @@
+redis_url = "redis://localhost:6379/0"
+extra_modules = [
+    "examples.experimental.realtime.realtime_websocket",
+    "examples.experimental.realtime.input_node",
+    "examples.experimental.realtime.audio_mixer"
+]
+
+[[nodes]]
+node_name = "speaker"
+node_class = "speaker"
+
+[nodes.node_args]
+rate = 24000
+input_channel = "Eve"
+
+[[nodes]]
+node_name = "speaker2"
+node_class = "speaker"
+
+[nodes.node_args]
+rate = 24000
+input_channel = "Jane"
+
+[[nodes]]
+node_name = "listener"
+node_class = "listener"
+
+[nodes.node_args]
+rate = 24000
+output_channel = "Jack"
+
+
+[[nodes]]
+node_name = "Eve"
+node_class = "openai_realtime"
+
+[nodes.node_args]
+input_channel = "Jane_mixed"
+output_channel = "Eve"
+instruction = "Your name is Eve, you are talking to your friend Jane and Jack. You want to convince them to play poker with you tonight. Please start every sentence with \"Jane,\" or \"Jack,\""
+
+
+[[nodes]]
+node_name = "Jane"
+node_class = "openai_realtime"
+
+[nodes.node_args]
+input_channel = "Eve_mixed"
+output_channel = "Jane"
+instruction = "Your name is Jane, you are talking to your friend Eve and Jack. You want to convince them to play soccer with you tonight. Please let him say the first sentence. Please start every sentence with \"Eve,\" or \"Jack,\""
+
+[[nodes]]
+node_name = "audio_mixer_Jane"
+node_class = "audio_mixer"
+
+[nodes.node_args]
+input_channels = ["Jane", "Jack"]
+tick_input_channel = "tick/millis/20"
+output_channel = "Jane_mixed"
+buffer_size = 960
+
+[[nodes]]
+node_name = "audio_mixer_Eve"
+node_class = "audio_mixer"
+
+[nodes.node_args]
+input_channels = ["Eve", "Jack"]
+tick_input_channel = "tick/millis/20"
+output_channel = "Eve_mixed"
+buffer_size = 960
+
+[[nodes]]
+node_name = "tick"
+node_class = "tick"