Skip to content

Commit ddf5edd

Browse files
committed
OCR'ing navigate page separately from preview page.
1 parent fd90eea commit ddf5edd

File tree

2 files changed

+34
-79
lines changed

2 files changed

+34
-79
lines changed

handlers/navigation_handler.py

Lines changed: 15 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -321,10 +321,7 @@ def navigate(
321321
if progress and progress.get("time_left"):
322322
# Try to cycle to get page/location information
323323
page_text = progress.get("reading_time_indicator", "")
324-
percentage = progress.get("percentage", "")
325-
updated_progress = cycle_page_indicator_if_needed(
326-
self.automator.driver, page_text, f"{percentage}%" if percentage else None
327-
)
324+
updated_progress = cycle_page_indicator_if_needed(self.automator.driver, page_text)
328325
if updated_progress:
329326
screenshot_data["progress"] = updated_progress
330327

@@ -390,6 +387,13 @@ def navigate(
390387
if not success:
391388
return {"error": "Navigation failed"}, 500
392389

390+
# After navigation, capture page info at the user's actual position (not preview position)
391+
# This is only needed when we're doing a preview, to get the correct page number
392+
navigation_page_info = None
393+
if success and preview_count != 0:
394+
# Extract just the page indicator at the navigation position
395+
_, navigation_page_info, _ = self._extract_screenshot_for_ocr(f"nav_pos")
396+
393397
# If preview was requested, handle it after navigating
394398
preview_ocr_text = None
395399
preview_page_info = None
@@ -404,10 +408,10 @@ def navigate(
404408
)
405409

406410
if preview_success and preview_ocr_text:
407-
# Use the page info from the preview if available, otherwise get current page data
408-
if preview_page_info:
409-
# Use the page info extracted from OCR of the preview page
410-
progress = preview_page_info
411+
# Use the page info from the navigation position, NOT the preview position
412+
if navigation_page_info:
413+
# Use the page info extracted at navigation position
414+
progress = navigation_page_info
411415
else:
412416
# Fall back to getting current page data after navigation
413417
progress = self.automator.state_machine.reader_handler.get_reading_progress(
@@ -464,10 +468,7 @@ def navigate(
464468
if progress and progress.get("time_left"):
465469
# Try to cycle to get page/location information
466470
page_text = progress.get("reading_time_indicator", "")
467-
percentage = progress.get("percentage", "")
468-
updated_progress = cycle_page_indicator_if_needed(
469-
self.automator.driver, page_text, f"{percentage}%" if percentage else None
470-
)
471+
updated_progress = cycle_page_indicator_if_needed(self.automator.driver, page_text)
471472
if updated_progress:
472473
screenshot_data["progress"] = updated_progress
473474

@@ -795,14 +796,11 @@ def _extract_screenshot_for_ocr(self, prefix: str) -> Tuple[Optional[str], Optio
795796

796797
# Extract page information from OCR results
797798
page_indicator_text = ocr_results.get("page_indicator_text")
798-
percentage_text = ocr_results.get("percentage_text")
799799

800800
# Parse the page indicators and handle time-based indicators
801-
if page_indicator_text or percentage_text:
801+
if page_indicator_text:
802802
# Use the cycle function which will tap if needed for time-based indicators
803-
page_info = cycle_page_indicator_if_needed(
804-
self.automator.driver, page_indicator_text, percentage_text
805-
)
803+
page_info = cycle_page_indicator_if_needed(self.automator.driver, page_indicator_text)
806804
else:
807805
page_info = None
808806

server/utils/ocr_utils.py

Lines changed: 19 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -335,14 +335,14 @@ def process_ocr(image_content, clean_ui_elements=True) -> Tuple[Optional[str], O
335335
return None, combined_error
336336

337337

338-
def extract_page_indicator_regions(image_bytes):
339-
"""Extract the page indicator and percentage regions from a screenshot.
338+
def extract_page_indicator_region(image_bytes):
339+
"""Extract the page indicator region from a screenshot.
340340
341341
Args:
342342
image_bytes: The screenshot as bytes
343343
344344
Returns:
345-
tuple: (page_indicator_bytes, percentage_bytes) - Cropped regions as bytes
345+
bytes: page_indicator_bytes - Cropped region as bytes
346346
"""
347347
from io import BytesIO
348348

@@ -353,7 +353,7 @@ def extract_page_indicator_regions(image_bytes):
353353
img = Image.open(BytesIO(image_bytes))
354354
width, height = img.size
355355

356-
# Define crop regions based on proportions
356+
# Define crop region based on proportions
357357
# Bottom-left for page/location indicator
358358
# The page number is in the bottom 6% of screen (bottom 80px of 1400px)
359359
page_indicator_box = (
@@ -363,46 +363,31 @@ def extract_page_indicator_regions(image_bytes):
363363
height, # Bottom edge
364364
)
365365

366-
# Bottom-right for percentage
367-
percentage_box = (
368-
int(width * 0.5), # Start at 50% from left
369-
int(height * 0.94), # Start at 94% from top (bottom 6%)
370-
width, # Right edge
371-
height, # Bottom edge
372-
)
373-
374-
# Crop the regions
366+
# Crop the region
375367
page_indicator_img = img.crop(page_indicator_box)
376-
percentage_img = img.crop(percentage_box)
377368

378369
# Convert back to bytes
379370
page_indicator_bytes = BytesIO()
380371
page_indicator_img.save(page_indicator_bytes, format="PNG")
381372
page_indicator_bytes = page_indicator_bytes.getvalue()
382373

383-
percentage_bytes = BytesIO()
384-
percentage_img.save(percentage_bytes, format="PNG")
385-
percentage_bytes = percentage_bytes.getvalue()
386-
387374
logger.debug(f"Cropped page indicator region: {page_indicator_box}")
388-
logger.debug(f"Cropped percentage region: {percentage_box}")
389375

390-
return page_indicator_bytes, percentage_bytes
376+
return page_indicator_bytes
391377

392378
except Exception as e:
393-
logger.error(f"Error extracting page indicator regions: {e}", exc_info=True)
394-
return None, None
379+
logger.error(f"Error extracting page indicator region: {e}", exc_info=True)
380+
return None
395381

396382

397-
def parse_page_indicators(page_indicator_text, percentage_text):
398-
"""Parse page indicator and percentage text to extract structured progress data.
383+
def parse_page_indicators(page_indicator_text):
384+
"""Parse page indicator text to extract structured progress data.
399385
400386
Args:
401387
page_indicator_text: OCR text from page indicator region (e.g., "Page 123 of 456", "8 mins left in chapter")
402-
percentage_text: OCR text from percentage region (e.g., "87%")
403388
404389
Returns:
405-
dict: Progress information with current_page/location, total_pages/locations, percentage, and/or time_left
390+
dict: Progress information with current_page/location, total_pages/locations, and/or time_left
406391
"""
407392
progress = {}
408393

@@ -435,17 +420,6 @@ def parse_page_indicators(page_indicator_text, percentage_text):
435420
progress["current_page"] = None
436421
progress["total_pages"] = None
437422

438-
# Parse percentage
439-
if percentage_text:
440-
percentage_match = re.search(r"(\d+)%", percentage_text)
441-
if percentage_match:
442-
progress["percentage"] = int(percentage_match.group(1))
443-
logger.info(f"Extracted percentage: {progress['percentage']}%")
444-
else:
445-
progress["percentage"] = None
446-
else:
447-
progress["percentage"] = None
448-
449423
return progress
450424

451425

@@ -456,13 +430,13 @@ def process_screenshot_with_regions(image_bytes):
456430
image_bytes: The screenshot as bytes
457431
458432
Returns:
459-
dict: Contains 'main_text', 'page_indicator_text', 'percentage_text', and any errors
433+
dict: Contains 'main_text', 'page_indicator_text', and any errors
460434
"""
461435
from io import BytesIO
462436

463437
from PIL import Image
464438

465-
result = {"main_text": None, "page_indicator_text": None, "percentage_text": None, "errors": []}
439+
result = {"main_text": None, "page_indicator_text": None, "errors": []}
466440

467441
try:
468442
# Load the image once
@@ -489,8 +463,8 @@ def process_screenshot_with_regions(image_bytes):
489463
elif main_error:
490464
result["errors"].append(f"Main text OCR error: {main_error}")
491465

492-
# Extract page indicator regions
493-
page_indicator_bytes, percentage_bytes = extract_page_indicator_regions(image_bytes)
466+
# Extract page indicator region
467+
page_indicator_bytes = extract_page_indicator_region(image_bytes)
494468

495469
# OCR page indicator
496470
if page_indicator_bytes:
@@ -503,19 +477,6 @@ def process_screenshot_with_regions(image_bytes):
503477
elif page_error:
504478
result["errors"].append(f"Page indicator OCR error: {page_error}")
505479

506-
# OCR percentage
507-
if percentage_bytes:
508-
percent_text, percent_error = KindleOCR.process_ocr(percentage_bytes, clean_ui_elements=False)
509-
if percent_text:
510-
# Clean up the text - remove any extra whitespace
511-
percent_text = percent_text.strip()
512-
result["percentage_text"] = percent_text
513-
logger.info(f"OCR: Percentage extracted: '{percent_text}'")
514-
elif percent_error:
515-
result["errors"].append(f"Percentage OCR error: {percent_error}")
516-
else:
517-
logger.warning("OCR: No percentage bytes extracted")
518-
519480
return result
520481

521482
except Exception as e:
@@ -626,19 +587,18 @@ def is_ocr_requested(default=False):
626587
return perform_ocr
627588

628589

629-
def cycle_page_indicator_if_needed(driver, page_indicator_text, percentage_text=None):
590+
def cycle_page_indicator_if_needed(driver, page_indicator_text):
630591
"""If time-based indicator is detected, tap to cycle through formats to get page/location.
631592
632593
Args:
633594
driver: The Appium driver instance
634595
page_indicator_text: The OCR'd text from the page indicator region
635-
percentage_text: Optional percentage text
636596
637597
Returns:
638598
dict: Updated progress information with page/location data if found
639599
"""
640600
# First parse what we have
641-
progress = parse_page_indicators(page_indicator_text, percentage_text)
601+
progress = parse_page_indicators(page_indicator_text)
642602

643603
# Check if we got a time-based indicator or "Learning reading speed" instead of page/location
644604
if (
@@ -710,7 +670,6 @@ def process_screenshot_response(screenshot_id, screenshot_path, use_base64=False
710670

711671
ocr_text = ocr_results.get("main_text")
712672
page_indicator_text = ocr_results.get("page_indicator_text")
713-
percentage_text = ocr_results.get("percentage_text")
714673
errors = ocr_results.get("errors", [])
715674

716675
if ocr_text:
@@ -720,14 +679,12 @@ def process_screenshot_response(screenshot_id, screenshot_path, use_base64=False
720679
logger.info(f"OCR text extracted successfully, length: {len(ocr_text)} characters")
721680

722681
# Log what we got from the page regions
723-
logger.info(
724-
f"Page indicator text: '{page_indicator_text}', Percentage text: '{percentage_text}'"
725-
)
682+
logger.info(f"Page indicator text: '{page_indicator_text}'")
726683

727684
# Parse and add page progress information if extracted
728685
# Note: We can't use cycle_page_indicator_if_needed here because we don't have access to the driver
729686
# The cycling should be handled by the calling code that has access to the driver
730-
progress = parse_page_indicators(page_indicator_text, percentage_text)
687+
progress = parse_page_indicators(page_indicator_text)
731688

732689
# Log the parsed progress
733690
logger.info(f"Parsed progress: {progress}")

0 commit comments

Comments
 (0)