diff --git a/Cargo.toml b/Cargo.toml index 129ca9966..eca79df6a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,4 +107,6 @@ image = "0.25.2" headless_chrome = "1.0.15" nix = { version = "0.29.0", features = ["signal"] } resvg = "0.44.0" -async-tar = "0.5.0" \ No newline at end of file +async-tar = "0.5.0" +tokio-tungstenite = "0.24.0" +tungstenite = "0.24.0" diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 4f4075169..8c233cdb8 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -16,12 +16,28 @@ use crate::tools::tools_description::{Tool, ToolDesc, ToolParam}; use reqwest::Client; use std::path::PathBuf; -use headless_chrome::{Browser, LaunchOptions, Tab}; +use headless_chrome::{Browser, LaunchOptions, Tab as HeadlessTab}; use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; +use headless_chrome::protocol::cdp::types::Event; use serde::{Deserialize, Serialize}; +use std::sync::Mutex; +use std::fmt; +use tokio::time::sleep; +use chrono::DateTime; +use headless_chrome::protocol::cdp::Target; +use headless_chrome::protocol::cdp::types::Method; + +use std::net::TcpStream; +use tungstenite::protocol::WebSocketConfig; +use tungstenite::stream::MaybeTlsStream; + +use base64::Engine; +use std::io::Cursor; +use image::imageops::FilterType; +use image::{ImageFormat, ImageReader}; #[derive(Clone, Serialize, Deserialize, Debug)] pub struct IntegrationChrome { @@ -39,9 +55,50 @@ pub struct ToolChrome { supports_clicks: bool, } +#[derive(Clone, Debug)] +enum DeviceType { + DESKTOP, + MOBILE, +} + +impl fmt::Display for DeviceType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DeviceType::DESKTOP => write!(f, "desktop"), + DeviceType::MOBILE => write!(f, "mobile"), + } + } +} + +#[derive(Clone)] +pub struct ChromeTab { + headless_tab: Arc, + device: DeviceType, + tab_id: String, + screenshot_scale_factor: f64, + // NOTE: logs vector should be at least limited + tab_log: Arc>>, +} + +impl ChromeTab { + fn new(headless_tab: Arc, device: &DeviceType, tab_id: &String) -> Self { + Self { + headless_tab, + device: device.clone(), + tab_id: tab_id.clone(), + screenshot_scale_factor: 1.0, + tab_log: Arc::new(Mutex::new(Vec::new())), + } + } + pub fn state_string(&self) -> String { + format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.headless_tab.get_url()) + } +} + struct ChromeSession { browser: Browser, - tabs: HashMap>, + tabs: HashMap>>, + web_socket_addr: Option, } impl ChromeSession { @@ -97,37 +154,43 @@ impl Tool for ToolChrome { None => return Err("Missing argument `commands`".to_string()) }; - let mut content = vec![]; + let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); + let mut tool_log = setup_chrome_session(gcx.clone(), &self.integration_chrome, &session_hashmap_key).await?; + + let command_session = { + let gcx_locked = gcx.read().await; + gcx_locked.integration_sessions.get(&session_hashmap_key) + .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? + .clone() + }; + + let mut mutlimodal_els = vec![]; for command in commands_str.lines().map(|s| s.trim()).collect::>() { let parsed_command = match parse_single_command(&command.to_string()) { Ok(command) => command, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to parse command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to parse command `{}`: {}.", command, e)); break } }; - match interact_with_chrome( - gcx.clone(), - &chat_id, - &self.integration_chrome, - &parsed_command, - ).await { - Ok(command_content) => { - content.extend(command_content); + match chrome_command_exec(&parsed_command, command_session.clone()).await { + Ok((execute_log, command_multimodal_els)) => { + tool_log.extend(execute_log); + mutlimodal_els.extend(command_multimodal_els); }, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to execute command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to execute command `{}`: {}.", command, e)); break } }; } + let mut content= vec![]; + content.push(MultimodalElement::new( + "text".to_string(), tool_log.join("\n") + )?); + content.extend(mutlimodal_els); + let msg = ContextEnum::ChatMessage(ChatMessage { role: "tool".to_string(), content: ChatContent::Multimodal(content), @@ -140,15 +203,26 @@ impl Tool for ToolChrome { } fn tool_description(&self) -> ToolDesc { - let mut commands_desc = r#"One or several commands separated by newline. The is an integer, for example 10, for you to identify the tab later. Supported commands: -navigate_to -screenshot -html -reload -device "#.to_string(); + let mut supported_commands = vec![ + "open_tab ", + "navigate_to ", + "screenshot ", + // "html ", + "reload ", + "press_key_at ", + "type_text_at ", + "tab_log ", + "chrome_dev_tools_protocol ", + ]; if self.supports_clicks { - commands_desc = format!("{}\nclick \ninsert_text \n", commands_desc); + supported_commands.extend(vec![ + "click_at ", + ]); } + let description = format!( + "One or several commands separated by newline. \ + The is an integer, for example 10, for you to identify the tab later. \ + Supported commands:\n{}", supported_commands.join("\n")); ToolDesc { name: "chrome".to_string(), agentic: true, @@ -157,7 +231,7 @@ device "#.to_string(); parameters: vec![ToolParam { name: "commands".to_string(), param_type: "string".to_string(), - description: commands_desc, + description, }], parameters_required: vec!["commands".to_string()], } @@ -217,148 +291,422 @@ async fn setup_chrome_session( // NOTE: we're not register any tabs because they can be used by another chat setup_log.push("No opened tabs.".to_string()); - let command_session: Box = Box::new(ChromeSession { browser, tabs: HashMap::new() }); + let command_session: Box = Box::new(ChromeSession { + browser, + tabs: HashMap::new(), + web_socket_addr: args.chrome_path.clone(), + }); gcx.write().await.integration_sessions.insert( session_hashmap_key.clone(), Arc::new(AMutex::new(command_session)) ); Ok(setup_log) } -async fn navigate_to(tab: &Arc, url: &String) -> Result { - tab.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("Chrome tab navigated to {}", tab.get_url())) -} +async fn screenshot_jpeg_base64( + tab: Arc>, + capture_beyond_viewport: bool, +) -> Result { + let jpeg_base64_data = { + let tab_lock = tab.lock().await; + tab_lock.headless_tab.call_method(Page::CaptureScreenshot { + format: Some(Page::CaptureScreenshotFormatOption::Jpeg), + clip: None, + quality: Some(75), + from_surface: Some(true), + capture_beyond_viewport: Some(capture_beyond_viewport), + }).map_err(|e| e.to_string())?.data + }; -async fn click_on_point(tab: &Arc, point: &Point) -> Result { - tab.click_point(point.clone()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("clicked on `{} {}`", point.x, point.y)) -} + let mut data = base64::prelude::BASE64_STANDARD + .decode(jpeg_base64_data).map_err(|e| e.to_string())?; + let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Jpeg); + let mut image = reader.decode().map_err(|e| e.to_string())?; + + let max_dimension = 800.0; + let scale_factor = max_dimension / std::cmp::max(image.width(), image.height()) as f32; + if scale_factor < 1.0 { + // NOTE: the tool operates on resized image well without a special model notification + let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); + image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); + // NOTE: we should store screenshot_scale_factor for every resized screenshot, not for a tab! + let mut tab_lock = tab.lock().await; + tab_lock.screenshot_scale_factor = scale_factor as f64; + } + + data = Vec::new(); + image.write_to(&mut Cursor::new(&mut data), ImageFormat::Jpeg).map_err(|e| e.to_string())?; -async fn insert_text(tab: &Arc, text: &String) -> Result { - tab.type_str(text.as_str()).map_err(|e| e.to_string())?; - Ok(format!("inserted text `{}`", text.clone())) + MultimodalElement::new("image/jpeg".to_string(), base64::prelude::BASE64_STANDARD.encode(data)) } async fn session_open_tab( chrome_session: &mut ChromeSession, - tab_name: &String, -) -> Result<(Arc, String), String> { - match chrome_session.tabs.get(tab_name) { + tab_id: &String, + device: &DeviceType, +) -> Result { + match chrome_session.tabs.get(tab_id) { Some(tab) => { - Ok((tab.clone(), format!("Using opened tab {}\n", tab_name.clone()))) + let tab_lock = tab.lock().await; + Err(format!("Tab is already opened: {}\n", tab_lock.state_string())) }, None => { - let tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; - chrome_session.tabs.insert(tab_name.clone(), tab.clone()); - Ok((tab, format!("Opened new tab {}\n", tab_name.clone()))) + let headless_tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; + match device { + DeviceType::MOBILE => { + headless_tab.call_method(Emulation::SetDeviceMetricsOverride { + width: 375, + height: 812, + device_scale_factor: 0.0, + mobile: true, + scale: None, + screen_width: None, + screen_height: None, + position_x: None, + position_y: None, + dont_set_visible_size: None, + screen_orientation: None, + viewport: None, + display_feature: None, + }).map_err(|e| e.to_string())?; + }, + DeviceType::DESKTOP => { + headless_tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; + } + } + let tab = Arc::new(AMutex::new(ChromeTab::new(headless_tab, device, tab_id))); + let tab_lock = tab.lock().await; + let tab_log = Arc::clone(&tab_lock.tab_log); + tab_lock.headless_tab.enable_log().map_err(|e| e.to_string())?; + tab_lock.headless_tab.add_event_listener(Arc::new(move |event: &Event| { + if let Event::LogEntryAdded(e) = event { + let formatted_ts = { + let dt = DateTime::from_timestamp(e.params.entry.timestamp as i64, 0).unwrap(); + dt.format("%Y-%m-%d %H:%M:%S").to_string() + }; + tab_log.lock().unwrap().push(format!("{} [{:?}]: {}", formatted_ts, e.params.entry.level, e.params.entry.text)); + } + })).map_err(|e| e.to_string())?; + chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + Ok(format!("opened a new tab: {}\n", tab_lock.state_string())) } } } +async fn session_get_tab_arc( + chrome_session: &ChromeSession, + tab_id: &String, +) -> Result>, String> { + match chrome_session.tabs.get(tab_id) { + Some(tab) => Ok(tab.clone()), + None => Err(format!("tab_id {} is not opened", tab_id)), + } +} + +pub fn websocket_connection( + ws_url: &String, +) -> Result>, String> { + let mut client = tungstenite::client::connect_with_config( + ws_url.as_str(), + Some(WebSocketConfig { + max_message_size: None, + max_frame_size: None, + accept_unmasked_frames: true, + ..Default::default() + }), + u8::MAX - 1, + ).map_err(|e| e.to_string())?; + let stream = client.0.get_mut(); + let stream = match stream { + MaybeTlsStream::Plain(s) => s, + _ => todo!(), + }; + stream.set_read_timeout(Some(Duration::from_millis(100))).map_err(|e| e.to_string())?; + Ok(client.0) +} + +async fn raw_cdp_call_method( + web_socket_addr: &Option, + target_id: &String, + method: &Value, +) -> Result { + let mut connection = { + if let Some(url) = web_socket_addr { + websocket_connection(&url).map_err(|e| e.to_string())? + } else { + todo!("we can't get ws address directly from browser") + } + }; + let session_id = { + let target_method = Target::AttachToTarget { + target_id: target_id.clone(), + flatten: None, + }; + let message_text = serde_json::to_string(&target_method.to_method_call(9001)).map_err(|e| e.to_string())?; + let message = tungstenite::protocol::Message::text(message_text); + connection.send(message).map_err(|e| e.to_string())?; + let result = connection.read().map_err(|e| e.to_string())?; + let json_result = result.to_string().parse::().map_err(|e| e.to_string())?; + if let Value::String(session_id) = json_result["params"]["sessionId"].clone() { + session_id + } else { + return Err(format!("Failed to get session_id for {}", target_id)); + } + }; + let result = { + let mut target_message = method.clone(); + target_message["id"] = Value::Number(serde_json::Number::from(9002)); + let target_method = Target::SendMessageToTarget { + message: target_message.to_string(), + target_id: None, + session_id: Some(session_id), + }; + let message_text = serde_json::to_string(&target_method.to_method_call(9003)).map_err(|e| e.to_string())?; + let message = tungstenite::protocol::Message::text(message_text); + connection.send(message).map_err(|e| e.to_string())?; + connection.read().map_err(|e| e.to_string())? + }; + Ok(result.to_string()) +} + #[derive(Debug)] enum Command { - // TODO: probably we need connect command - // if we're tying to operate on non-existing tab (no connection or something like this) - // we should not auto-open connection again + OpenTab(OpenTabArgs), NavigateTo(NavigateToArgs), Screenshot(ScreenshotArgs), Html(HtmlArgs), Reload(ReloadArgs), - Device(DeviceArgs), - Click(ClickArgs), - InsertText(InsertTextArgs), + ClickAt(ClickAtArgs), + TypeTextAt(TypeTextAtArgs), + PressKeyAt(PressKeyAtArgs), + TabLog(TabLogArgs), + TabCDP(TabCDPArgs), } -impl Command { - pub async fn execute( - &self, - chrome_session: &mut ChromeSession - ) -> Result<(Vec, Vec), String> { - let mut tool_log = vec![]; - let mut multimodal_els = vec![]; - - match self { - Command::NavigateTo(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = navigate_to(&tab, &args.uri).await.map_err( - |e| format!("Can't navigate_to `{}` on tab `{}`: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, args.tab_id, e) - )?; - tool_log.push(content); - }, - Command::Screenshot(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let screenshot = screenshot_jpeg_base64(&tab, false).await?; - tool_log.push(format!("Made a screenshot of {}", tab.get_url())); - multimodal_els.push(screenshot); - }, - Command::Html(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let client = Client::builder() - .build() - .map_err(|e| e.to_string())?; - let url = tab.get_url(); - let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; - if !response.status().is_success() { - tool_log.push(format!("Unable to fetch url: {}; status: {}", url, response.status())); - } else { - tool_log.push(response.text().await.map_err(|e| e.to_string())?); +async fn chrome_command_exec( + cmd: &Command, + chrome_session: Arc>>, +) -> Result<(Vec, Vec), String> { + let mut tool_log = vec![]; + let mut multimodal_els = vec![]; + + match cmd { + Command::OpenTab(args) => { + let log = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_open_tab(chrome_session, &args.tab_id, &args.device).await? + }; + tool_log.push(log); + }, + Command::NavigateTo(args) => { + let tab: Arc> = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match { + tab_lock.headless_tab.navigate_to(args.uri.as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("navigate_to successful: {}", tab_lock.state_string()) + }, + Err(e) => { + format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) + }, } - }, - Command::Reload(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - tab.reload(false, None).map_err(|e| e.to_string())?; - tool_log.push(format!("Page `{}` on tab `{}` reloaded", tab.get_url(), args.tab_id)); - }, - Command::Device(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - match args.device { - DeviceType::MOBILE => { - tab.call_method(Emulation::SetDeviceMetricsOverride { - width: 375, - height: 812, - device_scale_factor: 0.0, - mobile: true, - scale: None, - screen_width: None, - screen_height: None, - position_x: None, - position_y: None, - dont_set_visible_size: None, - screen_orientation: None, - viewport: None, - display_feature: None, - }).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to mobile view", args.tab_id)); + }; + tool_log.push(log); + }, + Command::Screenshot(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + // NOTE: this operation is not atomic, unfortunately + match screenshot_jpeg_base64(tab.clone(), false).await { + Ok(multimodal_el) => { + multimodal_els.push(multimodal_el); + let tab_lock = tab.lock().await; + format!("made a screenshot of {}", tab_lock.state_string()) }, - DeviceType::DESKTOP => { - tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to desktop view", args.tab_id)); + Err(e) => { + let tab_lock = tab.lock().await; + format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::Html(args) => { + // NOTE: removed from commands list, please rewrite me... + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + let url = tab_lock.headless_tab.get_url(); + match { + let client = Client::builder() + .build() + .map_err(|e| e.to_string())?; + let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; + if response.status().is_success() { + let html = response.text().await.map_err(|e| e.to_string())?; + Ok(html) + } else { + Err(format!("status: {}", response.status())) } + } { + Ok(html) => { + format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) + }, + Err(e) => { + format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) + }, } - }, - Command::Click(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = click_on_point(&tab, &args.point).await?; - tool_log.push(content); - }, - Command::InsertText(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = insert_text(&tab, &args.text).await?; - tool_log.push(content); - }, - } - - Ok((tool_log, multimodal_els)) + }; + tool_log.push(log); + }, + Command::Reload(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + let chrome_tab = tab_lock.headless_tab.clone(); + match chrome_tab.reload(false, None) { + Ok(_) => { + format!("reload of {} successful", tab_lock.state_string()) + }, + Err(e) => { + format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::ClickAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match { + let mapped_point = Point { + x: args.point.x / tab_lock.screenshot_scale_factor, + y: args.point.y / tab_lock.screenshot_scale_factor, + }; + tab_lock.headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) + }, + Err(e) => { + format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::TypeTextAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match tab_lock.headless_tab.type_str(args.text.as_str()) { + Ok(_) => { + format!("type `{}` at {}", args.text, tab_lock.state_string()) + }, + Err(e) => { + format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::PressKeyAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match { + tab_lock.headless_tab.press_key(args.key.to_string().as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + // TODO: sometimes page isn't ready for next step + sleep(Duration::from_secs(1)).await; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("press `{}` at {}", args.key, tab_lock.state_string()) + }, + Err(e) => { + format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::TabLog(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let tab_lock = tab.lock().await; + // NOTE: we're waiting for log to be collected for 3 seconds + sleep(Duration::from_secs(3)).await; + let mut tab_log_lock = tab_lock.tab_log.lock().unwrap(); + tool_log.extend(tab_log_lock.clone()); + tab_log_lock.clear(); + }, + Command::TabCDP(args) => { + let (tab, maybe_web_socket_addr) = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + (session_get_tab_arc(chrome_session, &args.tab_id).await?, chrome_session.web_socket_addr.clone()) + }; + let tab_lock = tab.lock().await; + let target_id = tab_lock.headless_tab.get_target_id(); + let log = match raw_cdp_call_method(&maybe_web_socket_addr, &target_id, &args.method).await { + Ok(result) => { + format!("CDP method `{}` called for {}: {}", args.method, tab_lock.state_string(), result) + }, + Err(e) => { + format!("failed to execute CDP method `{}` at {}: {}", args.method, tab_lock.state_string(), e.to_string()) + } + }; + sleep(Duration::from_millis(100)).await; + tool_log.push(log); + }, } + + Ok((tool_log, multimodal_els)) +} + +#[derive(Debug)] +struct OpenTabArgs { + device: DeviceType, + tab_id: String, } #[derive(Debug)] @@ -383,32 +731,60 @@ struct ReloadArgs { } #[derive(Debug)] -struct ClickArgs { +struct ClickAtArgs { point: Point, tab_id: String, } #[derive(Debug)] -struct InsertTextArgs { +struct TypeTextAtArgs { text: String, tab_id: String, } +#[derive(Clone, Debug)] +enum Key { + ENTER, + ESC, + PAGEUP, + PAGEDOWN, + HOME, + END, +} + +impl fmt::Display for Key { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Key::ENTER => write!(f, "Enter"), + Key::ESC => write!(f, "Escape"), + Key::PAGEUP => write!(f, "PageUp"), + Key::PAGEDOWN => write!(f, "PageDown"), + Key::HOME => write!(f, "Home"), + Key::END => write!(f, "End"), + } + } +} #[derive(Debug)] -enum DeviceType { - DESKTOP, - MOBILE, +struct PressKeyAtArgs { + key: Key, + tab_id: String, } #[derive(Debug)] -struct DeviceArgs { - device: DeviceType, +struct TabLogArgs { + // wait_secs: u32, + tab_id: String, +} + +#[derive(Debug)] +struct TabCDPArgs { tab_id: String, + method: Value, } fn parse_single_command(command: &String) -> Result { - let args = shell_words::split(&command).map_err(|e| e.to_string())?; + let args = command.split(" ").map(|e| e.to_string()).collect::>(); if args.is_empty() { return Err("Command is empty".to_string()); } @@ -416,6 +792,20 @@ fn parse_single_command(command: &String) -> Result { let (command_name, parsed_args) = (args[0].clone(), args[1..].to_vec()); match command_name.as_str() { + "open_tab" => { + if parsed_args.len() < 2 { + return Err(format!("`open_tab` requires 2 arguments: `` and `tab_id`. Provided: {:?}", parsed_args)); + } + let device = match parsed_args[0].as_str() { + "desktop" => DeviceType::DESKTOP, + "mobile" => DeviceType::MOBILE, + _ => return Err(format!("unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) + }; + Ok(Command::OpenTab(OpenTabArgs { + device, + tab_id: parsed_args[1].clone(), + })) + }, "navigate_to" => { if parsed_args.len() < 2 { return Err(format!("`navigate_to` requires 2 arguments: `uri` and `tab_id`. Provided: {:?}", parsed_args)); @@ -449,26 +839,13 @@ fn parse_single_command(command: &String) -> Result { tab_id: parsed_args[0].clone(), })) }, - "device" => { - if parsed_args.len() < 2 { - return Err(format!("`device` requires 2 arguments: `desktop|mobile` and `tab_id`. Provided: {:?}", parsed_args)); - } - Ok(Command::Device(DeviceArgs { - device: match parsed_args[0].as_str() { - "desktop" => DeviceType::DESKTOP, - "mobile" => DeviceType::MOBILE, - _ => return Err(format!("Unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) - }, - tab_id: parsed_args[1].clone(), - })) - }, - "click" => { + "click_at" => { match parsed_args.as_slice() { [x_str, y_str, tab_id] => { let x = x_str.parse::().map_err(|e| format!("Failed to parse x: {}", e))?; let y = y_str.parse::().map_err(|e| format!("Failed to parse y: {}", e))?; let point = Point { x, y }; - Ok(Command::Click(ClickArgs { + Ok(Command::ClickAt(ClickAtArgs { point, tab_id: tab_id.clone(), })) @@ -478,10 +855,10 @@ fn parse_single_command(command: &String) -> Result { } } }, - "insert_text" => { + "type_text_at" => { match parsed_args.as_slice() { [text, tab_id] => { - Ok(Command::InsertText(InsertTextArgs { + Ok(Command::TypeTextAt(TypeTextAtArgs { text: text.clone(), tab_id: tab_id.clone(), })) @@ -491,46 +868,52 @@ fn parse_single_command(command: &String) -> Result { } } }, + "press_key_at" => { + match parsed_args.as_slice() { + [key_str, tab_id] => { + let key = match key_str.to_lowercase().as_str() { + "enter" => Key::ENTER, + "esc" => Key::ESC, + "pageup" => Key::PAGEUP, + "pagedown" => Key::PAGEDOWN, + "home" => Key::HOME, + "end" => Key::END, + _ => return Err(format!("Unknown key: {}", key_str)), + }; + Ok(Command::PressKeyAt(PressKeyAtArgs { + key, + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'key', 'tab_id'".to_string()) + } + } + }, + "tab_log" => { + match parsed_args.as_slice() { + [tab_id] => { + Ok(Command::TabLog(TabLogArgs { + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'tab_id'".to_string()) + } + } + }, + "chrome_dev_tools_protocol" => { + if parsed_args.len() < 1 { + return Err("Missing 'tab_id'".to_string()); + } + let tab_id = parsed_args[0].clone(); + let method = serde_json::from_str(parsed_args[1..].join(" ").as_str()) + .map_err(|e| format!("Can't parse cdp method: {}", e.to_string()))?; + Ok(Command::TabCDP(TabCDPArgs { + tab_id, + method, + })) + } _ => Err(format!("Unknown command: {:?}.", command_name)), } } - -async fn interact_with_chrome( - gcx: Arc>, - chat_id: &String, - integration_chrome: &IntegrationChrome, - command: &Command, -) -> Result, String> { - let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); - let setup_log = setup_chrome_session(gcx.clone(), &integration_chrome, &session_hashmap_key).await?; - - let command_session = { - let gcx_locked = gcx.read().await; - gcx_locked.integration_sessions.get(&session_hashmap_key) - .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? - .clone() - }; - let mut command_session_locked = command_session.lock().await; - let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; - - let (execute_log, mut multimodal_els) = command.execute(chrome_session).await?; - - let tool_log = setup_log.iter().chain(execute_log.iter()).map(|s| s.clone()).collect::>(); - multimodal_els.push(MultimodalElement::new( - "text".to_string(), tool_log.join("\n") - )?); - - Ok(multimodal_els) -} - -async fn screenshot_jpeg_base64(tab: &Arc, capture_beyond_viewport: bool) -> Result { - let jpeg_data = tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), - clip: None, - quality: Some(75), - from_surface: Some(true), - capture_beyond_viewport: Some(capture_beyond_viewport), - }).map_err(|e| e.to_string())?.data; - - MultimodalElement::new("image/jpeg".to_string(), jpeg_data) -} diff --git a/src/integrations/mod.rs b/src/integrations/mod.rs index d004f0247..8ac7254a2 100644 --- a/src/integrations/mod.rs +++ b/src/integrations/mod.rs @@ -54,7 +54,7 @@ chrome: # Or you can give it ws:// path, read more here https://developer.chrome.com/docs/devtools/remote-debugging/local-server/ # In that case start chrome with --remote-debugging-port chrome_path: "ws://127.0.0.1:6006/" - window_size: [1024, 768] + window_size: [800, 600] idle_browser_timeout: 600 diff --git a/src/tools/tool_patch_aux/tickets_parsing.rs b/src/tools/tool_patch_aux/tickets_parsing.rs index ac3271901..bdf596fcf 100644 --- a/src/tools/tool_patch_aux/tickets_parsing.rs +++ b/src/tools/tool_patch_aux/tickets_parsing.rs @@ -71,7 +71,7 @@ pub struct TicketToApply { } pub fn good_error_text(reason: &str, tickets: &Vec, resolution: Option) -> (String, Option) { - let mut text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); + let text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); if let Some(resolution) = resolution { let cd_format = format!("💿 {resolution}"); return (text, Some(cd_format))