Skip to content

Commit

Permalink
cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
eavanvalkenburg committed Feb 17, 2025
1 parent 95c5229 commit 6e9a82b
Show file tree
Hide file tree
Showing 16 changed files with 68 additions and 78 deletions.
5 changes: 2 additions & 3 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,9 @@ dapr = [
"dapr-ext-fastapi>=1.14.0",
"flask-dapr>=1.14.0"
]
openai_realtime = [
"openai[realtime] ~= 1.0",
realtime = [
"websockets >= 13, < 15",
"aiortc>=1.9.0",
"sounddevice>=0.5.1",
]

[tool.uv]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.contents.events import RealtimeTextEvent
from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeTextEvent

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
Expand All @@ -21,12 +21,11 @@
# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - semantic-kernel[realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub
# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]

# The characteristics of your speaker and microphone are a big factor in a smooth conversation
# so you may need to try out different devices for each.
Expand All @@ -41,11 +40,8 @@ async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtime("websocket")
audio_player = AudioPlayerWebsocket()
realtime_client = AzureRealtime(
"websocket",
audio_output_callback=audio_player.client_callback,
)
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# Create the settings for the session
settings = OpenAIRealtimeExecutionSettings(
Expand All @@ -64,10 +60,10 @@ async def main() -> None:
async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True):
async for event in realtime_client.receive():
match event:
# this can be used as an alternative to the callback function used above,
# this can be used as an alternative to the callback function used in other samples,
# the callback is faster and smoother
# case RealtimeAudioEvent():
# await audio_player.add_audio(event.audio)
case RealtimeAudioEvent():
await audio_player.add_audio(event.audio)
case RealtimeTextEvent():
print(event.text.text, end="")
case _:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@
# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - semantic-kernel[realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub
# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]

# The characteristics of your speaker and microphone are a big factor in a smooth conversation
# so you may need to try out different devices for each.
Expand Down Expand Up @@ -67,9 +66,7 @@ async def main() -> None:
# the context manager calls the create_session method on the client and start listening to the audio stream
async with audio_player, realtime_client(settings=settings, create_response=True):
async for event in realtime_client.receive():
match event:
# case RealtimeAudioEvent():
# await audio_player.add_audio(event.audio)
match event.service_type:
case RealtimeTextEvent():
print(event.text.text, end="")
case _:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,11 @@
# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - semantic-kernel[realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub
# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]


@kernel_function
Expand Down Expand Up @@ -64,8 +63,9 @@ async def main() -> None:
# create the audio player and audio track
# both take a device_id parameter, which is the index of the device to use, if None the default device is used
audio_player = AudioPlayerWebsocket()
# create the realtime client and optionally add the audio output function, this is optional
# create the realtime client and add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# (at this time Azure only supports websockets)
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtime(
protocol="websocket",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,11 @@
# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - semantic-kernel[realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub
# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]

# The characteristics of your speaker and microphone are a big factor in a smooth conversation
# so you may need to try out different devices for each.
Expand Down
17 changes: 10 additions & 7 deletions python/samples/demos/call_automation/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# "Quart",
# "azure-eventgrid",
# "azure-communication-callautomation==1.4.0b1",
# "semantic-kernel[openai_realtime]",
# "semantic-kernel[realtime]",
# ]
#
# [tool.uv.sources]
Expand Down Expand Up @@ -42,16 +42,16 @@
from quart import Quart, Response, json, request, websocket

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
InputAudioTranscription,
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime
from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent
from semantic_kernel.contents.events import RealtimeAudioEvent
from semantic_kernel.functions import kernel_function

Expand Down Expand Up @@ -94,7 +94,7 @@ async def goodbye(self):

kernel.add_plugin(plugin=HelperPlugin(), plugin_name="helpers", description="Helper functions for the realtime client.")

# region: handlers for audio and data streams
# region: Handlers for audio and data streams


async def from_realtime_to_acs(audio: ndarray):
Expand Down Expand Up @@ -155,7 +155,7 @@ async def handle_realtime_messages(client: RealtimeClientBase):
print(f" AI:-- {event.service_event.transcript}")


# region: Quart routes
# region: Routes


# WebSocket.
Expand Down Expand Up @@ -285,6 +285,9 @@ def home():
return "Hello SKxACS CallAutomation!"


# region: Main


if __name__ == "__main__":
app.logger.setLevel(INFO)
app.run(port=8080)
6 changes: 5 additions & 1 deletion python/samples/demos/call_automation/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Create and activate python virtual environment and install required packages usi
```
pip install -r requirements.txt
```
Alternatively, if you have `uv` installed, you can skip this step.

### Setup and host your Azure DevTunnel

Expand All @@ -46,7 +47,10 @@ Copy the `.env.example` file to `.env` and update the following values:

## Run app locally

1. Navigate to `call_automation` folder and run `main.py` in debug mode or use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal
1. Navigate to `call_automation` folder and do one of the following to start the main application:
- run `main.py` in debug mode from your IDE
- use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal.
- execute `./main.py` directly (this uses `uv`, which will then install the requirements in a temporary virtual environment).
2. Browser should pop up with the below page. If not, navigate to `http://localhost:8080/` or your dev tunnel url.
3. Register an EventGrid Webhook for the IncomingCall(`https://<devtunnelurl>/api/incomingCall`) event that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification).

Expand Down
1 change: 0 additions & 1 deletion python/samples/demos/call_automation/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
Quart>=0.19.6
azure-eventgrid==4.11.0
aiohttp>= 3.11.9
azure-communication-callautomation==1.4.0b1
semantic-kernel
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from semantic_kernel.contents.events.realtime_event import (
RealtimeAudioEvent,
RealtimeEvent,
RealtimeEvents,
RealtimeFunctionCallEvent,
RealtimeFunctionResultEvent,
RealtimeTextEvent,
Expand Down Expand Up @@ -69,7 +70,7 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase):
_current_settings: PromptExecutionSettings | None = PrivateAttr(default=None)
_call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict)

async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvent, None]:
async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvents, None]:
"""Handle all events but audio delta.
Audio delta has to be handled by the implementation of the protocol as some
Expand Down Expand Up @@ -194,7 +195,7 @@ async def update_session(
async def _parse_function_call_arguments_done(
self,
event: ResponseFunctionCallArgumentsDoneEvent,
) -> AsyncGenerator[RealtimeEvent | None]:
) -> AsyncGenerator[RealtimeEvents | None]:
"""Handle response function call done.
This always yields at least 1 event, either a RealtimeEvent or a RealtimeFunctionResultEvent with the raw event.
Expand Down Expand Up @@ -250,7 +251,7 @@ async def _send(self, event: RealtimeClientEvent) -> None:
raise NotImplementedError

@override
async def send(self, event: RealtimeEvent, **kwargs: Any) -> None:
async def send(self, event: RealtimeEvents, **kwargs: Any) -> None:
match event:
case RealtimeAudioEvent():
await self._send(
Expand Down Expand Up @@ -455,7 +456,7 @@ async def create_session(
pass

@override
def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvent, None]:
def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvents, None]:
pass

@override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@
from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents
from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.events import RealtimeEvent
from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent
from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents
from semantic_kernel.utils.experimental_decorator import experimental_class

if TYPE_CHECKING:
Expand All @@ -51,13 +50,13 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase):
peer_connection: RTCPeerConnection | None = None
data_channel: RTCDataChannel | None = None
audio_track: MediaStreamTrack | None = None
_receive_buffer: asyncio.Queue[RealtimeEvent] = PrivateAttr(default_factory=asyncio.Queue)
_receive_buffer: asyncio.Queue[RealtimeEvents] = PrivateAttr(default_factory=asyncio.Queue)

@override
async def receive(
self,
**kwargs: Any,
) -> AsyncGenerator[RealtimeEvent, None]:
) -> AsyncGenerator[RealtimeEvents, None]:
while True:
event = await self._receive_buffer.get()
yield event
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents
from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvent
from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents
from semantic_kernel.utils.experimental_decorator import experimental_class

if TYPE_CHECKING:
Expand All @@ -42,7 +42,7 @@ class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase):
async def receive(
self,
**kwargs: Any,
) -> AsyncGenerator[RealtimeEvent, None]:
) -> AsyncGenerator[RealtimeEvents, None]:
await self.connected.wait()
if not self.connection:
raise ValueError("Connection is not established.")
Expand Down
9 changes: 4 additions & 5 deletions python/semantic_kernel/connectors/ai/realtime_client_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,19 @@
from collections.abc import AsyncGenerator, Callable, Coroutine
from typing import Any, ClassVar

from pydantic import PrivateAttr

if sys.version_info >= (3, 11):
from typing import Self # pragma: no cover
else:
from typing_extensions import Self # pragma: no cover

from numpy import ndarray
from pydantic import PrivateAttr

from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration
from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.contents.events.realtime_event import RealtimeEvent
from semantic_kernel.contents.events.realtime_event import RealtimeEvents
from semantic_kernel.services.ai_service_client_base import AIServiceClientBase
from semantic_kernel.utils.experimental_decorator import experimental_class

Expand All @@ -34,7 +33,7 @@ class RealtimeClientBase(AIServiceClientBase, ABC):
_create_kwargs: dict[str, Any] | None = PrivateAttr(default=None)

@abstractmethod
async def send(self, event: RealtimeEvent) -> None:
async def send(self, event: RealtimeEvents) -> None:
"""Send an event to the service.
Args:
Expand All @@ -47,7 +46,7 @@ async def send(self, event: RealtimeEvent) -> None:
def receive(
self,
**kwargs: Any,
) -> AsyncGenerator[RealtimeEvent, None]:
) -> AsyncGenerator[RealtimeEvents, None]:
"""Starts listening for messages from the service, generates events.
Args:
Expand Down
5 changes: 0 additions & 5 deletions python/semantic_kernel/contents/audio_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,3 @@ def from_audio_file(cls: type[_T], path: str) -> _T:
def to_dict(self) -> dict[str, Any]:
"""Convert the instance to a dictionary."""
return {"type": "audio_url", "audio_url": {"uri": str(self)}}

@classmethod
def from_ndarray(cls: type[_T], data: ndarray, mime_type: str) -> _T:
"""Create an instance from an ndarray."""
return cls(data=data, mime_type=mime_type)
2 changes: 2 additions & 0 deletions python/semantic_kernel/contents/events/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from semantic_kernel.contents.events.realtime_event import (
RealtimeAudioEvent,
RealtimeEvent,
RealtimeEvents,
RealtimeFunctionCallEvent,
RealtimeFunctionResultEvent,
RealtimeImageEvent,
Expand All @@ -12,6 +13,7 @@
__all__ = [
"RealtimeAudioEvent",
"RealtimeEvent",
"RealtimeEvents",
"RealtimeFunctionCallEvent",
"RealtimeFunctionResultEvent",
"RealtimeImageEvent",
Expand Down
14 changes: 13 additions & 1 deletion python/semantic_kernel/contents/events/realtime_event.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) Microsoft. All rights reserved.

from typing import Any, ClassVar, Literal
from typing import Annotated, Any, ClassVar, Literal, Union

from pydantic import Field

Expand All @@ -11,6 +11,18 @@
from semantic_kernel.contents.text_content import TextContent
from semantic_kernel.kernel_pydantic import KernelBaseModel

# Type alias covering the realtime event models listed in the Union below,
# annotated as a pydantic discriminated union keyed on the `event_type` field.
# Members are written as string forward references because this alias precedes
# the class definitions (e.g. `RealtimeEvent`, declared directly below).
# NOTE(review): a discriminated union presumably needs every member, including
# the base `RealtimeEvent`, to declare a Literal `event_type` — confirm.
RealtimeEvents = Annotated[
Union[
"RealtimeEvent",
"RealtimeAudioEvent",
"RealtimeTextEvent",
"RealtimeFunctionCallEvent",
"RealtimeFunctionResultEvent",
"RealtimeImageEvent",
],
Field(discriminator="event_type"),
]


class RealtimeEvent(KernelBaseModel):
"""Base class for all service events."""
Expand Down
Loading

0 comments on commit 6e9a82b

Please sign in to comment.