Commit cc7b479

committed
Claude: deleting existing duplicates
1 parent b5abde2 commit cc7b479

File tree

1 file changed: +182 -0 lines changed


tools/delete_duplicate_articles.py

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
#!/usr/bin/env python
"""
Delete duplicate articles that have no user interactions.

Finds near-duplicate articles using simhash and deletes the duplicates
if they have no bookmarks, readings, or other user references.
"""

from collections import defaultdict
from datetime import datetime, timedelta

from simhash import Simhash

import zeeguu.core
from zeeguu.core.model import Article, Language, Bookmark, UserArticle
from zeeguu.api.app import create_app
from zeeguu.logging import logp

app = create_app()
app.app_context().push()

db_session = zeeguu.core.model.db.session


def compute_simhash(text):
    """Compute a 64-bit simhash fingerprint for article content."""
    if not text:
        return None
    # Only the first ~1000 words are fingerprinted
    truncated = " ".join(text.split()[:1000])
    return Simhash(truncated).value


def has_user_interactions(article):
    """Check if article has any user interactions (bookmarks, readings, etc.)."""
    # Check for bookmarks via source_id
    if article.source_id:
        bookmark_count = Bookmark.query.filter_by(source_id=article.source_id).count()
        if bookmark_count > 0:
            return True

    # Check for user article interactions (reading history, likes, etc.)
    user_article_count = UserArticle.query.filter_by(article_id=article.id).count()
    if user_article_count > 0:
        return True

    return False


def find_and_delete_duplicates(
    language_code=None, days_back=1, distance_threshold=5, dry_run=True
):
    """
    Find duplicate articles and delete those without user interactions.

    Args:
        language_code: Only check articles in this language (None = all languages)
        days_back: How many days back to check for duplicates
        distance_threshold: Maximum Hamming distance to consider two articles duplicates
        dry_run: If True, only report what would be deleted without actually deleting
    """

    cutoff = datetime.now() - timedelta(days=days_back)

    query = Article.query.filter(
        Article.published_time >= cutoff,
        Article.content.isnot(None),
        Article.broken == 0,
    )

    if language_code:
        language = Language.find(language_code)
        query = query.filter(Article.language_id == language.id)
        logp(f"Checking {language.name} articles from last {days_back} days...")
    else:
        logp(f"Checking all articles from last {days_back} days...")

    articles = query.all()
    logp(f"Found {len(articles)} articles to check")

    # Compute simhashes for all articles
    article_hashes = []
    for article in articles:
        simhash = compute_simhash(article.content)
        if simhash:
            article_hashes.append((article, simhash))

    logp(f"Computed {len(article_hashes)} simhashes")

    # Group articles by feed so the pairwise comparison stays small
    by_feed = defaultdict(list)
    for article, simhash in article_hashes:
        by_feed[article.feed_id].append((article, simhash))

    logp(f"Articles spread across {len(by_feed)} feeds")

    # Find duplicates within each feed
    duplicates_to_delete = []
    seen = set()

    for feed_id, feed_articles in by_feed.items():
        logp(f"Checking feed {feed_id} ({len(feed_articles)} articles)...")

        for i, (article1, hash1) in enumerate(feed_articles):
            if article1.id in seen:
                continue

            for j, (article2, hash2) in enumerate(feed_articles):
                if i >= j or article2.id in seen:
                    continue

                distance = Simhash(hash1).distance(Simhash(hash2))

                if distance <= distance_threshold:
                    # Found a duplicate pair - decide which to keep
                    older = (
                        article1
                        if article1.published_time < article2.published_time
                        else article2
                    )
                    newer = article2 if older is article1 else article1

                    # Check which one has user interactions
                    older_has_users = has_user_interactions(older)
                    newer_has_users = has_user_interactions(newer)

                    if older_has_users and newer_has_users:
                        # Both have users, keep both
                        logp(
                            f"Both have users, keeping both: {older.id} and {newer.id}"
                        )
                        continue
                    elif older_has_users:
                        # Keep older, delete newer
                        duplicates_to_delete.append((newer, older, distance))
                        seen.add(newer.id)
                    elif newer_has_users:
                        # Keep newer, delete older
                        duplicates_to_delete.append((older, newer, distance))
                        seen.add(older.id)
                    else:
                        # Neither has users; keep the newer one
                        # (more likely to be better quality)
                        duplicates_to_delete.append((older, newer, distance))
                        seen.add(older.id)

    logp(f"\nFound {len(duplicates_to_delete)} duplicates to delete")

    # Report/delete duplicates
    deleted_count = 0
    for to_delete, to_keep, distance in duplicates_to_delete:
        logp(
            f"\n{'[DRY RUN] Would delete' if dry_run else 'Deleting'} article {to_delete.id}"
        )
        logp(f"  Title: {to_delete.title[:80]}")
        logp(f"  Published: {to_delete.published_time}")
        logp(f"  Keeping article {to_keep.id} (distance: {distance})")

        if not dry_run:
            db_session.delete(to_delete)
            deleted_count += 1

    if not dry_run and deleted_count > 0:
        db_session.commit()
        logp(f"\n✅ Deleted {deleted_count} duplicate articles")
    elif dry_run:
        logp(f"\n[DRY RUN] Would delete {len(duplicates_to_delete)} articles")
        logp("Run with dry_run=False to actually delete")
    else:
        logp("\nNo duplicates found to delete")


if __name__ == "__main__":
    import sys

    # Parse command line arguments; filter flags out of the positionals
    # so that `--delete` is not mistaken for a language code
    positional = [arg for arg in sys.argv[1:] if not arg.startswith("--")]
    language_code = positional[0] if positional else None
    dry_run = "--delete" not in sys.argv

    if dry_run:
        logp("Running in DRY RUN mode. Add --delete flag to actually delete articles.")

    find_and_delete_duplicates(
        language_code=language_code, days_back=10, distance_threshold=5, dry_run=dry_run
    )
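
For reference, the duplicate test above reduces to comparing 64-bit simhash fingerprints by Hamming distance. A minimal sketch using the same `simhash` library (the example strings are illustrative, not from the codebase, and the exact distances depend on the library's default tokenizer):

from simhash import Simhash

a = Simhash("the quick brown fox jumps over the lazy dog")
b = Simhash("the quick brown fox jumped over the lazy dog")
c = Simhash("completely unrelated text about other things")

# distance() counts the differing bits between two 64-bit fingerprints:
# near-duplicates land close together, unrelated texts far apart.
print(a.distance(b))  # small for near-identical texts
print(a.distance(c))  # much larger

# The script stores only the integer .value and reconstructs Simhash
# objects for comparison; the constructor accepts that integer directly.
assert Simhash(a.value).distance(b) == a.distance(b)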
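
Going by the `__main__` block, a dry run scoped to one language would look like `python tools/delete_duplicate_articles.py es` (where `es` is just an illustrative language code), and appending `--delete` performs the actual deletion; nothing is committed to the database otherwise.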
