Skip to content

Commit

Permalink
Fixing feed fetcher to use updated date if the feed doesn't provide i…
Browse files Browse the repository at this point in the history
…t otherwise.
  • Loading branch information
samuelclay committed Dec 24, 2012
1 parent 4741813 commit 63bd362
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 66 deletions.
130 changes: 70 additions & 60 deletions apps/rss_feeds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,11 @@ def get_by_id(cls, feed_id, feed_address=None):
def add_update_stories(self, stories, existing_stories, verbose=False):
ret_values = dict(new=0, updated=0, same=0, error=0)

if settings.DEBUG:
logging.debug(" ---> Checking %s new/updated against %s stories" % (
len(stories),
len(existing_stories)))

for story in stories:
if not story.get('title'):
continue
Expand All @@ -750,6 +755,9 @@ def add_update_stories(self, stories, existing_stories, verbose=False):

existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
if existing_story is None:
if settings.DEBUG:
logging.debug('- New story in feed (%s - %s): %s' % (self.feed_title, story.get('title'), len(story_content)))

s = MStory(story_feed_id = self.pk,
story_date = story.get('published'),
story_title = story.get('title'),
Expand All @@ -764,12 +772,10 @@ def add_update_stories(self, stories, existing_stories, verbose=False):
ret_values['new'] += 1
except (IntegrityError, OperationError):
ret_values['error'] += 1
if verbose:
if settings.DEBUG:
logging.info(' ---> [%-30s] ~SN~FRIntegrityError on new story: %s' % (self.feed_title[:30], story.get('guid')[:30]))
elif existing_story and story_has_changed:
# update story
# logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))

original_content = None
try:
if existing_story and existing_story.id:
Expand Down Expand Up @@ -803,11 +809,11 @@ def add_update_stories(self, stories, existing_stories, verbose=False):
# logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
if existing_story.story_guid != story.get('guid'):
self.update_read_stories_with_new_guid(existing_story.story_guid, story.get('guid'))

if settings.DEBUG:
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(story_content_diff), len(story_content)))

existing_story.story_feed = self.pk
# Do not allow publishers to change the story date once a story is published.
# Leads to incorrect unread story counts.
# existing_story.story_date = story.get('published')
existing_story.story_title = story.get('title')
existing_story.story_content = story_content_diff
existing_story.story_latest_content = story_content
Expand All @@ -816,6 +822,10 @@ def add_update_stories(self, stories, existing_stories, verbose=False):
existing_story.story_permalink = story_link
existing_story.story_guid = story.get('guid')
existing_story.story_tags = story_tags
# Do not allow publishers to change the story date once a story is published.
# Leads to incorrect unread story counts.
# existing_story.story_date = story.get('published') # No, don't

try:
existing_story.save()
ret_values['updated'] += 1
Expand Down Expand Up @@ -1052,68 +1062,68 @@ def get_permalink(self, entry):
def _exists_story(self, story=None, story_content=None, existing_stories=None):
story_in_system = None
story_has_changed = False
story_pub_date = story.get('published')
story_published_now = story.get('published_now', False)
story_link = self.get_permalink(story)
start_date = story_pub_date - datetime.timedelta(hours=8)
end_date = story_pub_date + datetime.timedelta(hours=8)

# story_pub_date = story.get('published')
# story_published_now = story.get('published_now', False)
# start_date = story_pub_date - datetime.timedelta(hours=8)
# end_date = story_pub_date + datetime.timedelta(hours=8)

for existing_story in existing_stories:
content_ratio = 0
existing_story_pub_date = existing_story.story_date
# existing_story_pub_date = existing_story.story_date
# print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
if (story_published_now or
(existing_story_pub_date > start_date and existing_story_pub_date < end_date)):

if 'story_latest_content_z' in existing_story:
existing_story_content = unicode(zlib.decompress(existing_story.story_latest_content_z))
elif 'story_latest_content' in existing_story:
existing_story_content = existing_story.story_latest_content
elif 'story_content_z' in existing_story:
existing_story_content = unicode(zlib.decompress(existing_story.story_content_z))
elif 'story_content' in existing_story:
existing_story_content = existing_story.story_content
else:
existing_story_content = u''

if isinstance(existing_story.id, unicode):
existing_story.story_guid = existing_story.id
if story.get('guid') and story.get('guid') == existing_story.story_guid:
story_in_system = existing_story

# Title distance + content distance, checking if story changed
story_title_difference = abs(levenshtein_distance(story.get('title'),
existing_story.story_title))

seq = difflib.SequenceMatcher(None, story_content, existing_story_content)

if 'story_latest_content_z' in existing_story:
existing_story_content = unicode(zlib.decompress(existing_story.story_latest_content_z))
elif 'story_latest_content' in existing_story:
existing_story_content = existing_story.story_latest_content
elif 'story_content_z' in existing_story:
existing_story_content = unicode(zlib.decompress(existing_story.story_content_z))
elif 'story_content' in existing_story:
existing_story_content = existing_story.story_content
else:
existing_story_content = u''

if (seq
and story_content
and existing_story_content
and seq.real_quick_ratio() > .9
and seq.quick_ratio() > .95):
content_ratio = seq.ratio()

if story_title_difference > 0 and content_ratio > .98:
story_in_system = existing_story
if story_title_difference > 0 or content_ratio < 1.0:
# print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
story_has_changed = True
break
if isinstance(existing_story.id, unicode):
existing_story.story_guid = existing_story.id
if story.get('guid') and story.get('guid') == existing_story.story_guid:
story_in_system = existing_story

# Title distance + content distance, checking if story changed
story_title_difference = abs(levenshtein_distance(story.get('title'),
existing_story.story_title))

seq = difflib.SequenceMatcher(None, story_content, existing_story_content)

if (seq
and story_content
and existing_story_content
and seq.real_quick_ratio() > .9
and seq.quick_ratio() > .95):
content_ratio = seq.ratio()

# More restrictive content distance, still no story match
if not story_in_system and content_ratio > .98:
# print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
story_in_system = existing_story
if story_title_difference > 0 and content_ratio > .98:
story_in_system = existing_story
if story_title_difference > 0 or content_ratio < 1.0:
# print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
story_has_changed = True
break

if story_in_system and not story_has_changed:
if story_content != existing_story_content:
story_has_changed = True
if story_link != existing_story.story_permalink:
story_has_changed = True
break

# More restrictive content distance, still no story match
if not story_in_system and content_ratio > .98:
# print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
story_in_system = existing_story
story_has_changed = True
break

if story_in_system and not story_has_changed:
if story_content != existing_story_content:
story_has_changed = True
if story_link != existing_story.story_permalink:
story_has_changed = True
# if story_pub_date != existing_story.story_date:
# story_has_changed = True
break


# if story_has_changed or not story_in_system:
Expand Down
15 changes: 10 additions & 5 deletions media/js/newsblur/reader/reader_add_feed.js
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ _.extend(NEWSBLUR.ReaderAddFeed.prototype, {
$loading.addClass('NB-active');
$submit.addClass('NB-disabled').val('Adding...');

this.model.save_add_url(url, folder, $.rescope(this.post_save_add_url, this));
this.model.save_add_url(url, folder, $.rescope(this.post_save_add_url, this), $.rescope(this.error, this));
},

post_save_add_url: function(e, data) {
Expand All @@ -250,13 +250,18 @@ _.extend(NEWSBLUR.ReaderAddFeed.prototype, {
this.model.preference('has_setup_feeds', true);
NEWSBLUR.reader.check_hide_getting_started();
} else {
var $error = $('.NB-error', '.NB-fieldset.NB-add-add-url');
$error.text(data.message);
$error.slideDown(300);
$submit.val('Add Site');
this.error(data);
}
},

error: function(data) {
var $submit = $('.NB-add-add-url input[type=submit]', this.$modal);
var $error = $('.NB-error', '.NB-fieldset.NB-add-add-url');
$error.text(data.message || "Oh no, there was a problem grabbing that URL and there's no good explanation for what happened.");
$error.slideDown(300);
$submit.val('Add Site');
},

save_add_folder: function() {
var $submit = $('.NB-add-add-folder input[type=submit]', this.$modal);
var $error = $('.NB-error', '.NB-fieldset.NB-add-add-folder');
Expand Down
2 changes: 1 addition & 1 deletion utils/story_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def _extract_date_tuples(date):
return parsed_date, date_tuple, today_tuple, yesterday_tuple

def pre_process_story(entry):
publish_date = entry.get('published_parsed')
publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
if publish_date:
publish_date = datetime.datetime(*publish_date[:6])
if not publish_date and entry.get('published'):
Expand Down

0 comments on commit 63bd362

Please sign in to comment.