cache key consistency

wdm0006 · wdm0006 · commit ec385d07880a · 2025-06-21T21:45:22.000-04:00
diff --git a/docs/source/cache.rst b/docs/source/cache.rst
@@ -173,6 +173,93 @@ The cache timestamp functionality is fully backward compatible:
 * No changes to Repository or ProjectDirectory APIs
 * All existing code continues to work unchanged
 
+Best Practices
+--------------
+
+Shared Cache Usage
+~~~~~~~~~~~~~~~~~~
+
+.. warning::
+   **Recommendation: Use Separate Cache Instances**
+   
+   While it's technically possible to share the same cache object across multiple Repository instances,
+   we **strongly recommend using separate cache instances** for each repository (see the benefits listed below).
+
+**Recommended Approach - Separate Caches:**
+
+.. code-block:: python
+
+    from gitpandas import Repository
+    from gitpandas.cache import DiskCache
+    
+    # Create separate cache instances for each repository
+    cache1 = DiskCache(filepath='repo1_cache.gz')
+    cache2 = DiskCache(filepath='repo2_cache.gz')
+    
+    repo1 = Repository('/path/to/repo1', cache_backend=cache1)
+    repo2 = Repository('/path/to/repo2', cache_backend=cache2)
+
+**Benefits of Separate Caches:**
+
+* **Complete Isolation**: No risk of cache eviction conflicts between repositories
+* **Predictable Memory Usage**: Each repository has its own memory budget
+* **Easier Debugging**: Cache issues are isolated to specific repositories  
+* **Better Performance**: No lock contention in multi-threaded scenarios
+* **Clear Cache Management**: You can clear or manage each repository's cache independently
+
+**If You Must Share Caches:**
+
+If you need to share a cache object across multiple repositories (e.g., because of memory constraints),
+this is supported: repository names are included in cache keys, so entries from different repositories do not collide:
+
+.. code-block:: python
+
+    from gitpandas import Repository
+    from gitpandas.cache import EphemeralCache
+    
+    # Shared cache (not recommended but supported)
+    shared_cache = EphemeralCache(max_keys=1000)
+    
+    repo1 = Repository('/path/to/repo1', cache_backend=shared_cache)
+    repo2 = Repository('/path/to/repo2', cache_backend=shared_cache)
+    
+    # Each repository gets separate cache entries
+    files1 = repo1.list_files()  # Creates cache key: list_files||repo1||None
+    files2 = repo2.list_files()  # Creates cache key: list_files||repo2||None
+
+**Shared Cache Considerations:**
+
+* Repository names are included in cache keys to prevent collisions
+* Cache eviction affects all repositories sharing the cache
+* Memory usage is shared across all repositories
+* Very active repositories may evict cache entries from less active ones
+
+Cache Size Planning
+~~~~~~~~~~~~~~~~~~~
+
+When planning cache sizes, consider:
+
+* **Repository Size**: Larger repositories generate more cache entries
+* **Operation Types**: Some operations (like ``cumulative_blame``) create many cache entries
+* **Memory Constraints**: Balance cache size with available system memory
+* **Analysis Patterns**: Frequently repeated analyses benefit from larger caches
+
+**Recommended Cache Sizes:**
+
+.. code-block:: python
+
+    # Small repositories (< 1000 commits)
+    cache = EphemeralCache(max_keys=100)
+    
+    # Medium repositories (1000-10000 commits)  
+    cache = EphemeralCache(max_keys=500)
+    
+    # Large repositories (> 10000 commits)
+    cache = EphemeralCache(max_keys=1000)
+    
+    # For disk/Redis caches, you can use larger sizes
+    cache = DiskCache(filepath='cache.gz', max_keys=5000)
+
 API Reference
 -------------
 
diff --git a/gitpandas/cache.py b/gitpandas/cache.py
@@ -83,8 +83,9 @@ def deco(self, *args, **kwargs):
             force_refresh = is_propagated_force or explicit_force_refresh
 
             # Generate the cache key (ensure force_refresh itself is not part of the key)
+            # Use || as delimiter to avoid conflicts with repository names containing underscores
             key_parts = [str(kwargs.get(k)) for k in key_list]
-            key = f"{key_prefix}_{self.repo_name}_{'_'.join(key_parts)}"
+            key = f"{key_prefix}||{self.repo_name}||{'_'.join(key_parts)}"
             logging.debug(f"Cache key generated for {key_prefix}: {key}")
 
             # Explicitly log force refresh bypass of cache read
diff --git a/tests/test_cache.py b/tests/test_cache.py
@@ -571,9 +571,9 @@ def mock_set(k, v):
         assert len(captured_keys) == 1
         key = captured_keys[0]
 
-        # Key should have proper separators
+        # Key should have proper separators (new format uses ||)
         assert key.startswith("test_method_")
-        assert "_test/repo_" in key
+        assert "||test/repo||" in key
 
         # Key should contain parameter values
         assert "val1" in key
diff --git a/tests/test_cache_key_consistency.py b/tests/test_cache_key_consistency.py
@@ -136,8 +136,8 @@ def mock_set(k, v):
 
         # Keys should be different because repo_name is different
         assert key1 != key2
-        assert "/path/to/repo_" in key1
-        assert "/path/to/repo/_" in key2
+        assert "||/path/to/repo||" in key1
+        assert "||/path/to/repo/||" in key2
 
     def test_complex_key_generation(self, temp_cache_path):
         """Test key generation with complex parameters"""
@@ -159,10 +159,10 @@ def mock_set(k, v):
 
         # Check key format
         key = captured_keys[0]
-        assert key.startswith("complex_method_")
-        assert "_value1_" in key
-        assert "_value2_" in key
-        assert "_value3" in key
+        assert key.startswith("complex_method||")
+        assert "value1_" in key
+        assert "value2_" in key
+        assert "value3" in key
 
         # Call again with different order of parameters in the call
         # Python should normalize kwargs, so the key should be the same