Skip to content

Commit 67e0b9a

Browse files
committed
updated to latest version of wordstats
1 parent f6ac0e1 commit 67e0b9a

File tree

4 files changed

+129
-2
lines changed

4 files changed

+129
-2
lines changed

default_api.cfg

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,9 @@ SMTP_EMAIL = ''
1616

1717
INVITATION_CODES=['test']
1818

19-
SEND_NOTIFICATION_EMAILS=False
19+
SEND_NOTIFICATION_EMAILS=False
20+
21+
# Wordstats preloading
22+
# Set to True in production to preload all language dictionaries at startup
23+
# Set to False in development to use lazy loading (faster startup)
24+
PRELOAD_WORDSTATS=False

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ simhash
3434
sentry-sdk[flask]
3535
sortedcontainers
3636
SQLAlchemy>=2.0
37-
git+https://github.com/zeeguu/python-wordstats.git@master#egg=wordstats
37+
wordstats==1.1.0
3838
google-cloud-texttospeech==2.3.0
3939
timeago
4040
pymysql
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
Migration script to recalculate ranks for ALL multi-word phrases.
5+
6+
This fixes phrases that were created before the July 31, 2025 fix
7+
and have incorrect ranks (using most frequent word instead of least frequent).
8+
"""
9+
10+
import sys
11+
import os
12+
13+
# Add the project root to Python path
14+
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15+
sys.path.insert(0, project_root)
16+
17+
from wordstats import Word
18+
from zeeguu.core.model import db
19+
from zeeguu.core.model.phrase import Phrase
20+
21+
# Initialize Flask app
22+
from zeeguu.api.app import create_app
23+
24+
app = create_app()
25+
26+
IMPOSSIBLE_RANK = 1000000
27+
28+
def recalculate_all_multiword_phrase_ranks():
    """Recalculate the rank of every multi-word phrase in the database.

    A phrase is considered multi-word when its content contains a space.
    Its rank is set to the highest rank found among its words, i.e. the
    rank of its least frequent word. A word whose lookup raises is counted
    as IMPOSSIBLE_RANK; a word whose rank is None is ignored. A phrase for
    which no rank could be collected is left untouched.

    Changes are committed every 10000 phrases and once more at the end.

    Returns:
        True when every commit succeeded, False when a commit failed
        (the session is rolled back in that case).
    """
    batch_size = 10000

    # Find all phrases with multiple words
    multiword_phrases = Phrase.query.filter(
        Phrase.content.like('% %'),  # Contains at least one space
    ).all()
    total = len(multiword_phrases)

    print(f"Found {total} multi-word phrases")

    updated_count = 0
    unchanged_count = 0

    for processed, phrase in enumerate(multiword_phrases, start=1):
        words = phrase.content.split()
        # Content made only of whitespace matches the LIKE filter but
        # splits into fewer than two words; such rows are skipped.
        if len(words) > 1:
            try:
                ranks = []
                for single_word in words:
                    try:
                        rank = Word.stats(single_word, phrase.language.code).rank
                    except Exception:
                        # If we can't get rank for a word, treat it as very
                        # rare. Only Exception is caught so that Ctrl-C or
                        # SystemExit abort the migration instead of being
                        # stored as a bogus rank.
                        ranks.append(IMPOSSIBLE_RANK)
                    else:
                        if rank is not None:
                            ranks.append(rank)

                if ranks:
                    # Take the highest rank (least frequent word)
                    correct_rank = max(ranks)

                    if phrase.rank != correct_rank:
                        old_rank = phrase.rank
                        phrase.rank = correct_rank
                        db.session.add(phrase)
                        updated_count += 1

                        if updated_count <= 10:  # Show first 10 changes
                            print(f"  Updated '{phrase.content}': {old_rank} -> {correct_rank}")
                    else:
                        unchanged_count += 1

            except Exception as e:
                # Best effort: report the phrase and move on to the next one.
                print(f"Error processing '{phrase.content}': {e}")

        # Commit in batches, after the current phrase has been handled so
        # the progress message matches what was actually processed.
        if processed % batch_size == 0:
            print(f"Processed {processed}/{total} phrases...")
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(f"Error committing changes: {e}")
                return False

    # Final commit
    try:
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        print(f"Error committing changes: {e}")
        return False

    print("\nSummary:")
    print(f"  Updated: {updated_count} phrases")
    print(f"  Unchanged: {unchanged_count} phrases")
    print(f"  Total processed: {total} phrases")
    return True
91+
92+
if __name__ == "__main__":
    # Script entry point: announce the run, execute the migration inside a
    # Flask application context, and exit with status 1 on failure.
    print("Starting ALL multi-word phrase rank recalculation...")
    print("This will fix phrases created before the July 31, 2025 fix.\n")

    with app.app_context():
        if not recalculate_all_multiword_phrase_ranks():
            print("\nMigration failed!")
            sys.exit(1)
        print("\nMigration completed successfully!")

zeeguu/api/app.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,4 +97,24 @@ def serve_audio(filename):
9797
)
9898
warning("*** ==== ZEEGUU CORE: Linked model with: " + anon_conn_string)
9999

100+
# Preload wordstats in production for faster response times
101+
if app.config.get("PRELOAD_WORDSTATS", False):
102+
warning("*** Preloading wordstats dictionaries...")
103+
start_time = time.time()
104+
from wordstats import LanguageInfo
105+
106+
# Get all supported languages from the database
107+
from zeeguu.core.model import Language
108+
with app.app_context():
109+
all_languages = Language.all_languages()
110+
language_codes = [lang.code for lang in all_languages]
111+
112+
# Preload all language dictionaries
113+
LanguageInfo.load_in_memory_for(language_codes)
114+
115+
elapsed = time.time() - start_time
116+
warning(f"*** Wordstats preloaded {len(language_codes)} languages in {elapsed:.2f}s")
117+
else:
118+
warning("*** Wordstats will use lazy loading (PRELOAD_WORDSTATS=False)")
119+
100120
return app

0 commit comments

Comments
 (0)