From bf7eff4f882b2a370a29db9a52d09ed40a67e871 Mon Sep 17 00:00:00 2001
From: Jannis Leidel
Date: Fri, 21 Sep 2007 19:31:08 +0000
Subject: [PATCH] initial checkin with a solid base of functions

git-svn-id: https://django-robots.googlecode.com/svn/trunk@2 12edf5ea-513a-0410-8a8c-37067077e60f
committer: leidel

--HG--
extra : convert_revision : 92a38069a2c4e833d364ac44e4a7f58af577a9ac
---
 INSTALL.txt                            | 15 +++++
 LICENSE.txt                            | 28 ++++++++
 MANIFEST.in                            |  5 ++
 README.txt                             | 90 ++++++++++++++++++++++++++
 robots/__init__.py                     |  0
 robots/models.py                       | 57 ++++++++++++++++
 robots/templates/robots/rule_list.html |  7 ++
 robots/urls.py                         | 17 +++++
 setup.py                               | 18 ++++++
 9 files changed, 237 insertions(+)
 create mode 100644 INSTALL.txt
 create mode 100644 LICENSE.txt
 create mode 100644 MANIFEST.in
 create mode 100644 README.txt
 create mode 100644 robots/__init__.py
 create mode 100644 robots/models.py
 create mode 100644 robots/templates/robots/rule_list.html
 create mode 100644 robots/urls.py
 create mode 100644 setup.py

diff --git a/INSTALL.txt b/INSTALL.txt
new file mode 100644
index 0000000..ecf0b30
--- /dev/null
+++ b/INSTALL.txt
@@ -0,0 +1,15 @@
+To install it, run the following command inside this directory:
+
+    python setup.py install
+
+Or if you'd prefer you can simply place the included ``robots``
+directory somewhere on your Python path, or symlink to it from
+somewhere on your Python path; this is useful if you're working from a
+Subversion checkout.
+
+Note that this application requires Python 2.3 or later, and a recent
+Subversion checkout of Django. You can obtain Python from
+http://www.python.org/ and Django from http://www.djangoproject.com/.
+
+This install notice was bluntly stolen from James Bennett's registration
+package, http://code.google.com/p/django-registration/
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..75fd133
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,28 @@
+Copyright (c) 2007, Jannis Leidel
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of the author nor the names of other
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
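A quick sanity check for the install step in ``INSTALL.txt`` above is to
import the package from a plain Python shell. This is only a sketch, assuming
the ``robots`` directory ended up somewhere on your Python path::

    >>> import robots
    >>> robots.__file__    # should point into site-packages or your checkout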
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..5658b56
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+include INSTALL.txt
+include LICENSE.txt
+include MANIFEST.in
+include README.txt
+recursive-include robots/templates *
\ No newline at end of file
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..c04b12f
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,90 @@
+=======================================
+Robots exclusion application for Django
+=======================================
+
+This is a basic Django application to manage robots.txt files following the
+`robots exclusion standard`_, complementing the Sitemap contrib application.
+
+.. _robots exclusion standard: http://www.robotstxt.org/
+
+How to use it in your own Django project
+============================================
+
+0. Get the source from the application site at:
+
+   http://code.google.com/p/django-robots/
+
+1. Follow the instructions in the INSTALL.txt file.
+
+2. Edit the settings.py of your Django project:
+
+   # Add ``robots`` to the ``INSTALLED_APPS`` of your Django project
+
+   # Check if ``django.contrib.sites`` and ``django.contrib.admin`` are in
+     ``INSTALLED_APPS`` and add them if necessary
+
+   It should look something like this:
+
+       INSTALLED_APPS = (
+           'django.contrib.auth',
+           'django.contrib.contenttypes',
+           'django.contrib.sessions',
+           'django.contrib.sites',
+           'django.contrib.admin',
+           'django.contrib.flatpages',
+           'robots',
+           'myproject.myapp',
+       )
+
+   # Check if ``django.template.loaders.app_directories.load_template_source``
+     is in the ``TEMPLATE_LOADERS`` list.
+
+   It should look something like this:
+
+       TEMPLATE_LOADERS = (
+           'django.template.loaders.filesystem.load_template_source',
+           'django.template.loaders.app_directories.load_template_source',
+       )
+
+3. Add this line to your site's root URLconf::
+
+       (r'^robots\.txt$', include('robots.urls')),
+
+4. Sync your database via shell (``manage.py syncdb`` within the project dir).
+
+5. Create Rule and URL objects in the admin interface. See ``Usage`` below.
+
+6. Go to /robots.txt under the URL of your Django site to see the results.
+
+Usage
+=====
+
+The application consists of two database models which are tied together with
+an m2m relationship:
+
+1. ``Rule`` - contains a user agent field and multiple URL patterns to
+   define an abstract disallowance rule. Create one and add one or more URL
+   patterns to it.
+
+   Please have a look at the `database of web robots`_ for a full list of
+   existing web robot user agent strings.
+
+2. ``Url`` - defines a case-sensitive and exact URL pattern which is used
+   together with a user agent string to disallow access for web robots.
+
+You can set ``SITEMAP_URL`` in your project's settings.py file to the URL of
+your sitemap.xml file, for example:
+
+    SITEMAP_URL = "http://www.djangoproject.com/sitemap.xml"
+
+This is added to the resulting robots.txt file as a "Sitemap:" statement.
+
+.. _database of web robots: http://www.robotstxt.org/wc/active/html/index.html
+
+Bugs, support, questions and headaches
+======================================
+
+Please leave your `questions and problems`_ on the `designated Google Code site`_.
+
+.. _designated Google Code site: http://code.google.com/p/django-robots/
+.. _questions and problems: http://code.google.com/p/django-robots/issues/list
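The admin interface is the intended way to create ``Rule`` and ``Url`` objects
(step 5 above), but they can also be created from a ``manage.py shell``
session. A minimal sketch, where the pattern and user agent values are just
placeholders::

    >>> from django.contrib.sites.models import Site
    >>> from robots.models import Rule, Url

    >>> url = Url.objects.create(pattern='/admin/')
    >>> rule = Rule.objects.create(user_agent='*')

    >>> # Tie the URL pattern and the current site to the rule via the
    >>> # two m2m relations described under "Usage" above.
    >>> rule.urls.add(url)
    >>> rule.sites.add(Site.objects.get_current())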
diff --git a/robots/__init__.py b/robots/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/robots/models.py b/robots/models.py
new file mode 100644
index 0000000..8a343e2
--- /dev/null
+++ b/robots/models.py
@@ -0,0 +1,57 @@
+from django.db import models
+from django.contrib.admin.views.doc import simplify_regex
+from django.contrib.sites.models import Site
+from django.utils.translation import ugettext_lazy as _
+from django.conf import settings
+
+def installed_url_patterns():
+    """
+    Helper function to return the URL patterns of the installed applications.
+    """
+    paths = []
+    urlconf = __import__(settings.ROOT_URLCONF, {}, {}, [''])
+    for pattern in urlconf.urlpatterns:
+        url_pattern = simplify_regex(pattern.regex.pattern)
+        if not url_pattern.endswith("robots.txt"):
+            paths.append(url_pattern)
+    return " ".join(paths)
+
+class Url(models.Model):
+    """
+    Defines a URL pattern which should not be allowed to be accessed by a web
+    robot. It's case-sensitive and exact, e.g. "/admin" and "/admin/" are
+    different URLs.
+    """
+    pattern = models.CharField(_('pattern'), max_length=255, core=True,
+        help_text=_('This is case-sensitive! Installed apps: %(patterns)s') % {'patterns': installed_url_patterns()})
+    class Meta:
+        verbose_name = _('url')
+        verbose_name_plural = _('urls')
+    class Admin:
+        pass
+    def __unicode__(self):
+        return u"%s" % self.pattern
+    def save(self):
+        if not self.pattern.startswith('/'):
+            self.pattern = '/' + self.pattern
+        super(Url, self).save()
+
+class Rule(models.Model):
+    """
+    Defines an abstract rule which is added to the virtual robots.txt
+    file, disallowing the user agent to access the given URLs. It uses the
+    Site contrib application to enable multiple robots.txt files.
+    """
+    user_agent = models.CharField(_('user agent'), max_length=255,
+        help_text=_("This should be a user agent string like 'Googlebot'. For a full list look at the database of Web Robots. Enter '*' to match all user agents."))
+    urls = models.ManyToManyField(Url, help_text=_("These are URLs which are not allowed to be accessed by web robots."))
+    sites = models.ManyToManyField(Site)
+    class Meta:
+        verbose_name = _('rule')
+        verbose_name_plural = _('rules')
+        ordering = ('-user_agent',)
+    class Admin:
+        list_filter = ('sites',)
+        search_fields = ('user_agent',)
+    def __unicode__(self):
+        return u"%s" % self.user_agent
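Two behaviours of these models are worth spelling out. The help text for
``Url.pattern`` calls ``installed_url_patterns()`` once, when the module is
first imported, so the list of installed URL patterns shown in the admin
reflects the URLconf at load time. And ``Url.save()`` normalizes patterns to
always carry a leading slash, as this shell sketch (with an example value)
shows::

    >>> from robots.models import Url
    >>> u = Url(pattern='admin/')
    >>> u.save()
    >>> u.pattern
    '/admin/'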
diff --git a/robots/templates/robots/rule_list.html b/robots/templates/robots/rule_list.html
new file mode 100644
index 0000000..f5069d5
--- /dev/null
+++ b/robots/templates/robots/rule_list.html
@@ -0,0 +1,7 @@
+{% if object_list %}{% for rule in object_list %}User-agent: {{ rule.user_agent }}
+{% for url in rule.urls.all %}Disallow: {{ url.pattern }}
+{% endfor %}
+{% endfor %}{% else %}User-agent: *
+Disallow:
+{% endif %}{% if sitemap_url %}Sitemap: {{ sitemap_url }}
+{% endif %}
diff --git a/robots/urls.py b/robots/urls.py
new file mode 100644
index 0000000..17dec8a
--- /dev/null
+++ b/robots/urls.py
@@ -0,0 +1,17 @@
+from django.conf.urls.defaults import *
+from django.contrib.sites.models import Site
+from django.conf import settings
+
+current_site = Site.objects.get_current()
+
+options = {
+    'queryset': current_site.rule_set.all(),
+    'allow_empty': True,
+    'extra_context': {
+        'sitemap_url': getattr(settings, "SITEMAP_URL", False),
+    }
+}
+
+urlpatterns = patterns('django.views.generic.list_detail',
+    (r'^$', 'object_list', options),
+)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ec23f41
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,18 @@
+from distutils.core import setup
+
+setup(name='robots',
+      version='0.1',
+      description='Robots exclusion application for Django, complementing Sitemaps.',
+      author='Jannis Leidel',
+      author_email='jannis@leidel.info',
+      url='http://code.google.com/p/django-robots/',
+      packages=['robots'],
+      package_dir={'robots': 'robots'},
+      classifiers=['Development Status :: 4 - Beta',
+                   'Environment :: Web Environment',
+                   'Intended Audience :: Developers',
+                   'License :: OSI Approved :: BSD License',
+                   'Operating System :: OS Independent',
+                   'Programming Language :: Python',
+                   'Topic :: Utilities'],
+      )
\ No newline at end of file
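Given a single rule with user agent ``*``, the URL patterns ``/admin/`` and
``/accounts/``, and ``SITEMAP_URL`` set, the template and view above render a
robots.txt roughly like the following (all values here are placeholders)::

    User-agent: *
    Disallow: /admin/
    Disallow: /accounts/

    Sitemap: http://www.example.com/sitemap.xml

Note that ``robots/urls.py`` looks up ``Site.objects.get_current()`` once at
import time, so the queryset serving /robots.txt is bound to whatever
``SITE_ID`` is set when the URLconf is first loaded.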