Skip to content

Commit

Permalink
Initial stab at MongoDB in feed fetching and feed viewing. Still need…
Browse files Browse the repository at this point in the history
… to fix authors, tags, counts, aggregations, and any remaining bugs.
  • Loading branch information
samuelclay committed Aug 21, 2010
1 parent 7e40103 commit fa3be28
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 72 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ build/
*.mode1v3
**/*.perspectivev*
*.pbxuser
data/
2 changes: 1 addition & 1 deletion apps/reader/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory, Feature
from apps.reader.forms import SignupForm, LoginForm, FeatureForm
try:
from apps.rss_feeds.models import Feed, Story, FeedPage, DuplicateFeed
from apps.rss_feeds.models import Feed, Story, MStory, FeedPage, DuplicateFeed
except:
pass
from utils import json, urlnorm
Expand Down
93 changes: 58 additions & 35 deletions apps/rss_feeds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hashlib
import random
import re
import mongoengine as mongo
from collections import defaultdict
from BeautifulSoup import BeautifulStoneSoup
from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
Expand Down Expand Up @@ -219,7 +220,7 @@ def update(self, force=False, feed=None, single_threaded=False):

return

def add_update_stories(self, stories, existing_stories):
def add_update_stories(self, stories, existing_stories, db):
ret_values = {
ENTRY_NEW:0,
ENTRY_UPDATED:0,
Expand All @@ -245,33 +246,35 @@ def add_update_stories(self, stories, existing_stories):
# pub_date = datetime.datetime.timetuple(story.get('published'))
# logging.debug('- New story: %s %s' % (pub_date, story.get('title')))

s = Story(story_feed = self,
s = MStory(story_feed_id = self.pk,
story_date = story.get('published'),
story_title = story.get('title'),
story_content = story_content,
story_author = story_author,
story_author_id = story_author.pk,
story_author_name = story.get('author'),
story_permalink = story.get('link'),
story_guid = story.get('guid') or story.get('id') or story.get('link'),
story_tags = self._shorten_story_tags(story_tags)
story_tags = self._shorten_and_encode_story_tags(story_tags),
# tags = story_tags
)
try:
s.save(force_insert=True)
s.save()
ret_values[ENTRY_NEW] += 1
cache.set('updated_feed:%s' % self.id, 1)
except IntegrityError:
ret_values[ENTRY_ERR] += 1
# print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
[s.tags.add(tcat) for tcat in story_tags]
# for tcat in story_tags:
# Tag.objects.get_or_create(feed=self, tag=tcat)
elif existing_story and story_has_changed:
# update story
# logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))

original_content = None
if existing_story.story_original_content:
original_content = existing_story.story_original_content
if existing_story.get('story_original_content'):
original_content = existing_story.get('story_original_content')
else:
original_content = existing_story.story_content
original_content = existing_story.get('story_content')
# print 'Type: %s %s' % (type(original_content), type(story_content))
if len(story_content) > 10:
diff = HTMLDiff(unicode(original_content), story_content)
Expand All @@ -280,26 +283,25 @@ def add_update_stories(self, stories, existing_stories):
story_content_diff = original_content
# logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
# logging.debug("\t\tDiff content: %s" % diff.getDiff())
if existing_story.story_title != story.get('title'):
if existing_story.get('story_title') != story.get('title'):
# logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
pass

s = Story(id = existing_story.id,
story_feed = self,
story_date = story.get('published'),
story_title = story.get('title'),
story_content = story_content_diff,
story_original_content = original_content,
story_author = story_author,
story_author_name = story.get('author'),
story_permalink = story.get('link'),
story_guid = story.get('guid') or story.get('id') or story.get('link'),
story_tags = self._shorten_story_tags(story_tags)
)
s.tags.clear()
[s.tags.add(tcat) for tcat in story_tags]
existing_story['story_feed'] = self.pk
existing_story['story_date'] = story.get('published')
existing_story['story_title'] = story.get('title')
existing_story['story_content'] = story_content_diff
existing_story['story_original_content'] = original_content
existing_story['story_author'] = story_author.pk
existing_story['story_author_name'] = story.get('author')
existing_story['story_permalink'] = story.get('link')
existing_story['story_guid'] = story.get('guid') or story.get('id') or story.get('link')
existing_story['story_tags'] = self._shorten_and_encode_story_tags(story_tags)
# existing_story['tags'] = story_tags
# s.tags.clear()
# [s.tags.add(tcat) for tcat in story_tags]
try:
s.save(force_update=True)
db.stories.update({'_id': existing_story['_id']}, existing_story)
ret_values[ENTRY_UPDATED] += 1
cache.set('updated_feed:%s' % self.id, 1)
except IntegrityError:
Expand Down Expand Up @@ -351,13 +353,13 @@ def save_popular_authors(self, feed_authors=None, lock=None):
if len(authors_list) > 1:
self.save_popular_authors(authors_list[:-1])

def _shorten_story_tags(self, story_tags):
def _shorten_and_encode_story_tags(self, story_tags):
encoded_tags = json.encode([t.name for t in story_tags])
if len(encoded_tags) < 2000:
return encoded_tags

if len(story_tags) > 1:
return self._shorten_story_tags(story_tags[:-1])
return self._shorten_and_encode_story_tags(story_tags[:-1])

def trim_feed(self):
from apps.reader.models import UserStory
Expand Down Expand Up @@ -393,7 +395,7 @@ def get_stories(self, offset=0, limit=25, force=False):
stories = None

if not stories or force:
stories_db = Story.objects.filter(story_feed=self)[offset:offset+limit]
stories_db = MStory.objects(story_feed_id=self.pk)[offset:offset+limit]
stories = self.format_stories(stories_db)
cache.set('feed_stories:%s-%s-%s' % (self.id, offset, limit), stories)

Expand Down Expand Up @@ -462,22 +464,22 @@ def _exists_story(self, story=None, story_content=None, existing_stories=None):
# print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
if story_published_now or\
(story_pub_date > start_date and story_pub_date < end_date):
if story.get('guid') and story.get('guid') == existing_story.story_guid:
if story.get('guid') and story.get('guid') == existing_story['_id']:
story_in_system = existing_story
elif story.get('link') and story.get('link') == existing_story.story_permalink:
elif story.get('link') and story.get('link') == existing_story.get('story_permalink'):
story_in_system = existing_story

# import pdb
# pdb.set_trace()

# Title distance + content distance, checking if story changed
story_title_difference = levenshtein_distance(story.get('title'),
existing_story.story_title)
seq = difflib.SequenceMatcher(None, story_content, existing_story.story_content)
existing_story.get('story_title'))
seq = difflib.SequenceMatcher(None, story_content, existing_story.get('story_content'))

if (seq
and story_content
and existing_story.story_content
and existing_story.get('story_content')
and seq.real_quick_ratio() > .9
and seq.quick_ratio() > .95):
content_ratio = seq.ratio()
Expand All @@ -497,7 +499,7 @@ def _exists_story(self, story=None, story_content=None, existing_stories=None):
break

if story_in_system:
if story_content != existing_story.story_content:
if story_content != existing_story.get('story_content'):
story_has_changed = True
break

Expand Down Expand Up @@ -640,14 +642,35 @@ class Meta:
db_table="stories"
ordering=["-story_date"]
unique_together = (("story_feed", "story_guid_hash"),)

def save(self, *args, **kwargs):
    """Fill in derived fields, then delegate to the parent class's ``save``.

    * ``story_guid_hash`` is set to the MD5 hex digest of ``story_guid``
      when the hash is empty and a guid is present.
    * ``story_title`` is truncated to 255 characters.

    All positional and keyword arguments are forwarded unchanged to the
    parent ``save``.
    """
    if not self.story_guid_hash and self.story_guid:
        guid = self.story_guid
        # hashlib works on bytes. Encode text explicitly so guids containing
        # non-ASCII characters hash instead of raising; ASCII guids produce
        # the same digest as before.
        if not isinstance(guid, bytes):
            guid = guid.encode('utf-8')
        self.story_guid_hash = hashlib.md5(guid).hexdigest()
    # A missing title (None or empty) has nothing to truncate; calling len()
    # on None would raise TypeError before the parent save ever runs.
    if self.story_title and len(self.story_title) > 255:
        self.story_title = self.story_title[:255]
    super(Story, self).save(*args, **kwargs)

class MStory(mongo.Document):
    '''A feed item, stored as a MongoDB document through mongoengine.

    Documents are kept in the ``stories`` collection (see ``meta``).

    NOTE(review): ``story_guid`` is declared as the primary key, so it
    becomes the document id and must be unique across the whole collection,
    not just within one feed -- confirm that two feeds can never carry the
    same guid.
    '''
    # Integer id of the feed this story belongs to.
    story_feed_id = mongo.IntField()
    # Publication timestamp of the story.
    story_date = mongo.DateTimeField()
    # Story headline, limited to 255 characters.
    story_title = mongo.StringField(max_length=255)
    # Current story body.
    story_content = mongo.StringField()
    # Earlier version of the body -- presumably kept once a story has been
    # updated so later diffs run against the first version; confirm with the
    # feed update code.
    story_original_content = mongo.StringField()
    # Content type label, limited to 255 characters.
    story_content_type = mongo.StringField(max_length=255)
    # Integer id of the story's author record.
    story_author_id = mongo.IntField()
    # Author display name, limited to 100 characters.
    story_author_name = mongo.StringField(max_length=100)
    # Link to the story.
    story_permalink = mongo.StringField()
    # Unique story identifier; used as the document's primary key.
    story_guid = mongo.StringField(primary_key=True)
    # Presumably a digest of story_guid (up to 40 characters) -- TODO confirm
    # where it is populated; nothing in this class sets it.
    story_guid_hash = mongo.StringField(max_length=40)
    # Tags serialized into a single string of at most 2000 characters.
    story_tags = mongo.StringField(max_length=2000)
    # Tags as a list of strings, each at most 100 characters.
    tags = mongo.ListField(mongo.StringField(max_length=100))

    # Collection name and the fields to index for lookups.
    meta = {
        'collection': 'stories',
        'indexes': ['story_feed_id', 'story_date']
    }

class FeedUpdateHistory(models.Model):
fetch_date = models.DateTimeField(default=datetime.datetime.now)
number_of_feeds = models.IntegerField()
Expand Down
22 changes: 10 additions & 12 deletions apps/rss_feeds/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from django.test.client import Client
from django.test import TestCase
from django.core import management
from apps.rss_feeds.models import Feed, Story
# from pprint import pprint
from apps.rss_feeds.models import Feed, MStory

class FeedTest(TestCase):
fixtures = ['rss_feeds.json']
Expand All @@ -17,19 +16,19 @@ def test_load_feeds__gawker(self):
management.call_command('loaddata', 'gawker1.json', verbosity=0)

feed = Feed.objects.get(feed_link__contains='gawker')
stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 0)

management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)

stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38)

management.call_command('loaddata', 'gawker2.json', verbosity=0)
management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)

# Test: 1 changed char in content
stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38)

response = self.client.post('/reader/load_single_feed', { "feed_id": 1 })
Expand All @@ -40,14 +39,13 @@ def test_load_feeds__gothamist(self):
self.client.login(username='conesus', password='test')

management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)

feed = Feed.objects.get(feed_link__contains='gothamist')
stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 0)

management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)

stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 42)

response = self.client.post('/reader/load_single_feed', { "feed_id": 4 })
Expand All @@ -57,7 +55,7 @@ def test_load_feeds__gothamist(self):
management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)

stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 42)

response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
Expand All @@ -72,18 +70,18 @@ def test_load_feeds__slashdot(self):
management.call_command('loaddata', 'slashdot1.json', verbosity=0)

feed = Feed.objects.get(feed_link__contains='slashdot')
stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 0)

management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)

stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38)

management.call_command('loaddata', 'slashdot2.json', verbosity=0)
management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)

stories = Story.objects.filter(story_feed=feed)
stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38)

response = self.client.post('/reader/load_single_feed', { "feed_id": 5 })
Expand Down
13 changes: 11 additions & 2 deletions settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import logging
import os
from mongoengine import connect

# ===========================
# = Directory Declarations =
Expand Down Expand Up @@ -164,12 +165,14 @@

AUTH_PROFILE_MODULE = 'newsblur.UserProfile'
TEST_DATABASE_COLLATION = 'utf8_general_ci'
TEST_DATABASE_NAME = 'newsblur_test'
ROOT_URLCONF = 'urls'
INTERNAL_IPS = ('127.0.0.1',)
LOGGING_LOG_SQL = True
APPEND_SLASH = True
SOUTH_TESTS_MIGRATE = False
SESSION_ENGINE = "django.contrib.sessions.backends.cached_db"
TEST_RUNNER = "utils.testrunner.TestRunner"

# ===========
# = Logging =
Expand Down Expand Up @@ -197,7 +200,7 @@
'apps.profile',
'devserver',
'south',
'test_utils',
# 'test_utils',
'utils',
'utils.typogrify',
# 'debug_toolbar'
Expand Down Expand Up @@ -230,4 +233,10 @@ def custom_show_toolbar(request):
'INTERCEPT_REDIRECTS': True,
'SHOW_TOOLBAR_CALLBACK': custom_show_toolbar,
'HIDE_DJANGO_SQL': False,
}
}

# =========
# = Mongo =
# =========

connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])
Loading

0 comments on commit fa3be28

Please sign in to comment.