cleanup

microsoft · Feb 17, 2025 · 6e9a82b · 6e9a82b
1 parent 95c5229
commit 6e9a82b
Show file tree

Hide file tree

Showing 16 changed files with 68 additions and 78 deletions.
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -125,10 +125,9 @@ dapr = [
     "dapr-ext-fastapi>=1.14.0",
     "flask-dapr>=1.14.0"
 ]
-openai_realtime = [
-    "openai[realtime] ~= 1.0",
+realtime = [
+    "websockets >= 13, < 15",
     "aiortc>=1.9.0",
-    "sounddevice>=0.5.1",
 ]
 
 [tool.uv]

diff --git a/...altime/01-chat_with_realtime_websocket.py → ...ltime/01a-chat_with_realtime_websocket.py b/...altime/01-chat_with_realtime_websocket.py → ...ltime/01a-chat_with_realtime_websocket.py
@@ -10,7 +10,7 @@
     OpenAIRealtimeExecutionSettings,
     TurnDetection,
 )
-from semantic_kernel.contents.events import RealtimeTextEvent
+from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeTextEvent
 
 logging.basicConfig(level=logging.WARNING)
 utils_log = logging.getLogger("samples.concepts.realtime.utils")
@@ -21,12 +21,11 @@
 # This simple sample demonstrates how to use the OpenAI Realtime API to create
 # a chat bot that can listen and respond directly through audio.
 # It requires installing:
-# - semantic-kernel[openai_realtime]
+# - semantic-kernel[realtime]
 # - pyaudio
 # - sounddevice
 # - pydub
-# - aiortc
-# e.g. pip install pyaudio sounddevice pydub
+# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
 
 # The characteristics of your speaker and microphone are a big factor in a smooth conversation
 # so you may need to try out different devices for each.
@@ -41,11 +40,8 @@ async def main() -> None:
     # create the realtime client, an audio output callback can optionally be passed in as well
     # you can define the protocol to use, either "websocket" or "webrtc"
     # they will behave the same way, even though the underlying protocol is quite different
+    realtime_client = AzureRealtime("websocket")
     audio_player = AudioPlayerWebsocket()
-    realtime_client = AzureRealtime(
-        "websocket",
-        audio_output_callback=audio_player.client_callback,
-    )
     audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
     # Create the settings for the session
     settings = OpenAIRealtimeExecutionSettings(
@@ -64,10 +60,10 @@ async def main() -> None:
     async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True):
         async for event in realtime_client.receive():
             match event:
-                # this can be used as an alternative to the callback function used above,
+                # this can be used as an alternative to the callback function used in other samples,
                 # the callback is faster and smoother
-                # case RealtimeAudioEvent():
-                #     await audio_player.add_audio(event.audio)
+                case RealtimeAudioEvent():
+                    await audio_player.add_audio(event.audio)
                 case RealtimeTextEvent():
                     print(event.text.text, end="")
                 case _:

diff --git a/.../realtime/01-chat_with_realtime_webrtc.py → ...realtime/01b-chat_with_realtime_webrtc.py b/.../realtime/01-chat_with_realtime_webrtc.py → ...realtime/01b-chat_with_realtime_webrtc.py
@@ -25,12 +25,11 @@
 # This simple sample demonstrates how to use the OpenAI Realtime API to create
 # a chat bot that can listen and respond directly through audio.
 # It requires installing:
-# - semantic-kernel[openai_realtime]
+# - semantic-kernel[realtime]
 # - pyaudio
 # - sounddevice
 # - pydub
-# - aiortc
-# e.g. pip install pyaudio sounddevice pydub
+# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
 
 # The characteristics of your speaker and microphone are a big factor in a smooth conversation
 # so you may need to try out different devices for each.
@@ -67,9 +66,7 @@ async def main() -> None:
     # the context manager calls the create_session method on the client and starts listening to the audio stream
     async with audio_player, realtime_client(settings=settings, create_response=True):
         async for event in realtime_client.receive():
-            match event:
-                # case RealtimeAudioEvent():
-                #     await audio_player.add_audio(event.audio)
+            match event.service_type:
                 case RealtimeTextEvent():
                     print(event.text.text, end="")
                 case _:

diff --git a/...me/02-azure_chat_with_function_calling.py → ...a-chat_with_function_calling_websocket.py b/...me/02-azure_chat_with_function_calling.py → ...a-chat_with_function_calling_websocket.py
@@ -24,12 +24,11 @@
 # This simple sample demonstrates how to use the OpenAI Realtime API to create
 # a chat bot that can listen and respond directly through audio.
 # It requires installing:
-# - semantic-kernel[openai_realtime]
+# - semantic-kernel[realtime]
 # - pyaudio
 # - sounddevice
 # - pydub
-# - aiortc
-# e.g. pip install pyaudio sounddevice pydub
+# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
 
 
 @kernel_function
@@ -64,8 +63,9 @@ async def main() -> None:
     # create the audio player and audio track
     # both take a device_id parameter, which is the index of the device to use, if None the default device is used
     audio_player = AudioPlayerWebsocket()
-    # create the realtime client and optionally add the audio output function, this is optional
+    # create the realtime client and add the audio output function, this is optional
     # you can define the protocol to use, either "websocket" or "webrtc"
+    # (at this time Azure only supports websockets)
     # they will behave the same way, even though the underlying protocol is quite different
     realtime_client = AzureRealtime(
         protocol="websocket",

diff --git a/...realtime/02-chat_with_function_calling.py → .../02b-chat_with_function_calling_webrtc.py b/...realtime/02-chat_with_function_calling.py → .../02b-chat_with_function_calling_webrtc.py
@@ -31,12 +31,11 @@
 # This simple sample demonstrates how to use the OpenAI Realtime API to create
 # a chat bot that can listen and respond directly through audio.
 # It requires installing:
-# - semantic-kernel[openai_realtime]
+# - semantic-kernel[realtime]
 # - pyaudio
 # - sounddevice
 # - pydub
-# - aiortc
-# e.g. pip install pyaudio sounddevice pydub
+# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
 
 # The characteristics of your speaker and microphone are a big factor in a smooth conversation
 # so you may need to try out different devices for each.

diff --git a/python/samples/demos/call_automation/main.py b/python/samples/demos/call_automation/main.py
@@ -14,7 +14,7 @@
 #     "Quart",
 #     "azure-eventgrid",
 #     "azure-communication-callautomation==1.4.0b1",
-#     "semantic-kernel[openai_realtime]",
+#     "semantic-kernel[realtime]",
 # ]
 #
 # [tool.uv.sources]
@@ -42,16 +42,16 @@
 from quart import Quart, Response, json, request, websocket
 
 from semantic_kernel import Kernel
-from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior
+from semantic_kernel.connectors.ai import FunctionChoiceBehavior
 from semantic_kernel.connectors.ai.open_ai import (
     InputAudioTranscription,
+    ListenEvents,
+    OpenAIRealtime,
     OpenAIRealtimeExecutionSettings,
     TurnDetection,
 )
-from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime
-from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents
 from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
-from semantic_kernel.contents.audio_content import AudioContent
+from semantic_kernel.contents import AudioContent
 from semantic_kernel.contents.events import RealtimeAudioEvent
 from semantic_kernel.functions import kernel_function
 
@@ -94,7 +94,7 @@ async def goodbye(self):
 
 kernel.add_plugin(plugin=HelperPlugin(), plugin_name="helpers", description="Helper functions for the realtime client.")
 
-# region: handlers for audio and data streams
+# region: Handlers for audio and data streams
 
 
 async def from_realtime_to_acs(audio: ndarray):
@@ -155,7 +155,7 @@ async def handle_realtime_messages(client: RealtimeClientBase):
                 print(f" AI:-- {event.service_event.transcript}")
 
 
-# region: Quart routes
+# region: Routes
 
 
 # WebSocket.
@@ -285,6 +285,9 @@ def home():
     return "Hello SKxACS CallAutomation!"
 
 
+# region: Main
+
+
 if __name__ == "__main__":
     app.logger.setLevel(INFO)
     app.run(port=8080)
diff --git a/python/samples/demos/call_automation/readme.md b/python/samples/demos/call_automation/readme.md
@@ -22,6 +22,7 @@ Create and activate python virtual environment and install required packages usi
 ```
 pip install -r requirements.txt
 ```
+Alternatively, if you have `uv` installed, you can skip this step.
 
 ### Setup and host your Azure DevTunnel
 
@@ -46,7 +47,10 @@ Copy the `.env.example` file to `.env` and update the following values:
 
 ## Run app locally
 
-1. Navigate to `call_automation` folder and run `main.py` in debug mode or use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal
+1. Navigate to `call_automation` folder and do one of the following to start the main application:
+   - run `main.py` in debug mode from your IDE
+   - use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal. 
+   - execute `./main.py` directly (this uses `uv`, which will then install the requirements in a temporary virtual environment).
 2. Browser should pop up with the below page. If not, navigate to `http://localhost:8080/` or your dev tunnel URL.
 3. Register an EventGrid Webhook for the IncomingCall(`https://<devtunnelurl>/api/incomingCall`) event that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification).
 

diff --git a/python/samples/demos/call_automation/requirements.txt b/python/samples/demos/call_automation/requirements.txt
@@ -1,5 +1,4 @@
 Quart>=0.19.6
 azure-eventgrid==4.11.0
-aiohttp>= 3.11.9
 azure-communication-callautomation==1.4.0b1
 semantic-kernel
diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py
@@ -39,6 +39,7 @@
 from semantic_kernel.contents.events.realtime_event import (
     RealtimeAudioEvent,
     RealtimeEvent,
+    RealtimeEvents,
     RealtimeFunctionCallEvent,
     RealtimeFunctionResultEvent,
     RealtimeTextEvent,
@@ -69,7 +70,7 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase):
     _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None)
     _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict)
 
-    async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvent, None]:
+    async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvents, None]:
         """Handle all events but audio delta.
 
         Audio delta has to be handled by the implementation of the protocol as some
@@ -194,7 +195,7 @@ async def update_session(
     async def _parse_function_call_arguments_done(
         self,
         event: ResponseFunctionCallArgumentsDoneEvent,
-    ) -> AsyncGenerator[RealtimeEvent | None]:
+    ) -> AsyncGenerator[RealtimeEvents | None]:
         """Handle response function call done.
 
         This always yields at least 1 event, either a RealtimeEvent or a RealtimeFunctionResultEvent with the raw event.
@@ -250,7 +251,7 @@ async def _send(self, event: RealtimeClientEvent) -> None:
         raise NotImplementedError
 
     @override
-    async def send(self, event: RealtimeEvent, **kwargs: Any) -> None:
+    async def send(self, event: RealtimeEvents, **kwargs: Any) -> None:
         match event:
             case RealtimeAudioEvent():
                 await self._send(
@@ -455,7 +456,7 @@ async def create_session(
         pass
 
     @override
-    def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvent, None]:
+    def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvents, None]:
         pass
 
     @override

diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py
@@ -31,8 +31,7 @@
 from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents
 from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase
 from semantic_kernel.contents.audio_content import AudioContent
-from semantic_kernel.contents.events import RealtimeEvent
-from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent
+from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents
 from semantic_kernel.utils.experimental_decorator import experimental_class
 
 if TYPE_CHECKING:
@@ -51,13 +50,13 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase):
     peer_connection: RTCPeerConnection | None = None
     data_channel: RTCDataChannel | None = None
     audio_track: MediaStreamTrack | None = None
-    _receive_buffer: asyncio.Queue[RealtimeEvent] = PrivateAttr(default_factory=asyncio.Queue)
+    _receive_buffer: asyncio.Queue[RealtimeEvents] = PrivateAttr(default_factory=asyncio.Queue)
 
     @override
     async def receive(
         self,
         **kwargs: Any,
-    ) -> AsyncGenerator[RealtimeEvent, None]:
+    ) -> AsyncGenerator[RealtimeEvents, None]:
         while True:
             event = await self._receive_buffer.get()
             yield event

diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py
@@ -20,7 +20,7 @@
 from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents
 from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase
 from semantic_kernel.contents.audio_content import AudioContent
-from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvent
+from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents
 from semantic_kernel.utils.experimental_decorator import experimental_class
 
 if TYPE_CHECKING:
@@ -42,7 +42,7 @@ class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase):
     async def receive(
         self,
         **kwargs: Any,
-    ) -> AsyncGenerator[RealtimeEvent, None]:
+    ) -> AsyncGenerator[RealtimeEvents, None]:
         await self.connected.wait()
         if not self.connection:
             raise ValueError("Connection is not established.")

diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py
@@ -5,20 +5,19 @@
 from collections.abc import AsyncGenerator, Callable, Coroutine
 from typing import Any, ClassVar
 
-from pydantic import PrivateAttr
-
 if sys.version_info >= (3, 11):
     from typing import Self  # pragma: no cover
 else:
     from typing_extensions import Self  # pragma: no cover
 
 from numpy import ndarray
+from pydantic import PrivateAttr
 
 from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration
 from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType
 from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
 from semantic_kernel.contents.chat_history import ChatHistory
-from semantic_kernel.contents.events.realtime_event import RealtimeEvent
+from semantic_kernel.contents.events.realtime_event import RealtimeEvents
 from semantic_kernel.services.ai_service_client_base import AIServiceClientBase
 from semantic_kernel.utils.experimental_decorator import experimental_class
 
@@ -34,7 +33,7 @@ class RealtimeClientBase(AIServiceClientBase, ABC):
     _create_kwargs: dict[str, Any] | None = PrivateAttr(default=None)
 
     @abstractmethod
-    async def send(self, event: RealtimeEvent) -> None:
+    async def send(self, event: RealtimeEvents) -> None:
         """Send an event to the service.
 
         Args:
@@ -47,7 +46,7 @@ async def send(self, event: RealtimeEvent) -> None:
     def receive(
         self,
         **kwargs: Any,
-    ) -> AsyncGenerator[RealtimeEvent, None]:
+    ) -> AsyncGenerator[RealtimeEvents, None]:
         """Starts listening for messages from the service, generates events.
 
         Args:

diff --git a/python/semantic_kernel/contents/audio_content.py b/python/semantic_kernel/contents/audio_content.py
@@ -86,8 +86,3 @@ def from_audio_file(cls: type[_T], path: str) -> _T:
     def to_dict(self) -> dict[str, Any]:
         """Convert the instance to a dictionary."""
         return {"type": "audio_url", "audio_url": {"uri": str(self)}}
-
-    @classmethod
-    def from_ndarray(cls: type[_T], data: ndarray, mime_type: str) -> _T:
-        """Create an instance from an ndarray."""
-        return cls(data=data, mime_type=mime_type)
diff --git a/python/semantic_kernel/contents/events/__init__.py b/python/semantic_kernel/contents/events/__init__.py
@@ -3,6 +3,7 @@
 from semantic_kernel.contents.events.realtime_event import (
     RealtimeAudioEvent,
     RealtimeEvent,
+    RealtimeEvents,
     RealtimeFunctionCallEvent,
     RealtimeFunctionResultEvent,
     RealtimeImageEvent,
@@ -12,6 +13,7 @@
 __all__ = [
     "RealtimeAudioEvent",
     "RealtimeEvent",
+    "RealtimeEvents",
     "RealtimeFunctionCallEvent",
     "RealtimeFunctionResultEvent",
     "RealtimeImageEvent",

diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft. All rights reserved.
 
-from typing import Any, ClassVar, Literal
+from typing import Annotated, Any, ClassVar, Literal, Union
 
 from pydantic import Field
 
@@ -11,6 +11,18 @@
 from semantic_kernel.contents.text_content import TextContent
 from semantic_kernel.kernel_pydantic import KernelBaseModel
 
+RealtimeEvents = Annotated[
+    Union[
+        "RealtimeEvent",
+        "RealtimeAudioEvent",
+        "RealtimeTextEvent",
+        "RealtimeFunctionCallEvent",
+        "RealtimeFunctionResultEvent",
+        "RealtimeImageEvent",
+    ],
+    Field(discriminator="event_type"),
+]
+
 
 class RealtimeEvent(KernelBaseModel):
     """Base class for all service events."""