Huge set of changes (you need to remove the old backup and download again)
* Added a timeout for url requests
* Added more error catching & reporting
* Redesigned the database to allow storing & reading recursive path keys
* Multithreaded download of media files (with the new download-threads option; see the config sketch below)
* Media also stores the metadata of items
* Downloading of profile photos for all users
* Downloading of photos & blog for your user
* A number of small bugfixes
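
The multithreaded downloader sizes its worker pool from the new `download-threads` option (read via `c.cfg('download-threads')` in lib/Media.py). A minimal sketch of how it might look in the config ini file the README mentions; the section name is an assumption, only the `download-threads` key comes from this commit:

```ini
; Hypothetical section name -- only the download-threads key is
; introduced by this commit; 4 worker threads is an arbitrary choice.
[vk-backup]
download-threads = 4
```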
rabits committed Oct 21, 2014
1 parent 6d63d67 commit 22b9fb9
Showing 10 changed files with 330 additions and 107 deletions.
15 changes: 8 additions & 7 deletions README.md
```diff
@@ -16,16 +16,17 @@ You can use config ini file to store your configuration.
 
 Features:
 ---------
-* Download known users & friends
-* Download dialogs
-* Download chats
-* Download attachments media
+* Store known users & friends
+* Dialogs
+* Chats
+* Attachments media
+* Wall
+* Photos
 
 TODO:
 -----
-* Download wall
-* Download groups
-* Download photos, audio, video
+* Groups
+* Photo albums, audio, video
 * Advanced configuration
 
 Requirements:
```
6 changes: 5 additions & 1 deletion lib/Api.py
```diff
@@ -37,14 +37,18 @@ def request(method, params):
             params['access_token'] = _TOKEN
             params['v'] = '5.25'
             url = "https://api.vk.com/method/%s?%s" % (method, urlencode(params))
-            data = json.loads(urllib2.urlopen(url).read())
+            data = json.loads(urllib2.urlopen(url, None, 30).read())
             if 'response' not in data:
                 raise Exception('no correct response while calling api method "%s", data: %s' % (method, data))
             break
         except Exception as e:
             c.log('warning', 'Retry request %i (3): %s' % (retry, str(e)))
             time.sleep(2.0*(retry+1))
 
+    if 'response' not in data:
+        c.log('error', 'Unable to process request')
+        return None
+
     return data['response']
 
 def getUserId():
```
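
Since `Api.request()` now returns `None` once its three retries are exhausted, instead of raising on the final failure, every call site must check for `None` before indexing into the result; the Chats, Dialogs and Messages changes below add exactly that guard. A minimal caller sketch under the new contract; `requestUser` is a hypothetical helper, not part of this commit:

```python
import Api

def requestUser(user_id):
    # Api.request() returns the parsed 'response' payload,
    # or None after three failed attempts.
    data = Api.request('users.get', {'user_ids': user_id})
    if data == None:
        return None  # bail out early, same style as the real call sites
    return data[0]   # users.get returns a list of user objects
```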
2 changes: 2 additions & 0 deletions lib/Chats.py
```diff
@@ -24,6 +24,8 @@ def requestChatInfo(self, chat_id):
             'log': []
         }
         data = Api.request('messages.getChat', {'chat_id': chat_id})
+        if data == None:
+            return
         if len(data['users']) > 0:
             Users.requestUsers([ str(u) for u in data['users'] ])
         self.data[chat_id]['data'] = data
```
2 changes: 1 addition & 1 deletion lib/Common
Submodule Common updated 1 file
+2 −1 Common.py
36 changes: 26 additions & 10 deletions lib/Database.py
```diff
@@ -20,24 +20,40 @@ def __init__(self):
 
         self.path = os.path.join(c.cfg('backup-dir'), self.__class__.__name__)
 
         # Create directory
         if not os.path.isdir(self.path):
             os.makedirs(self.path)
 
         # Loading local data from the storage
         self.load()
 
     def store(self):
         c.log('debug', 'Store %s (%i)' % (self.__class__.__name__, len(self.data)))
         for i in self.data:
-            with codecs.open(os.path.join(self.path, i + '.json'), 'w', 'utf-8') as outfile:
+            path = os.path.join(self.path, i + '.json')
+            if not os.path.isdir(os.path.dirname(path)):
+                os.makedirs(os.path.dirname(path))
+            with codecs.open(path, 'w', 'utf-8') as outfile:
                 json.dump(self.data[i], outfile, indent=1, ensure_ascii=False)
 
-    def load(self):
-        files = [ f for f in os.listdir(self.path) if f.endswith('.json') ]
-        c.log('debug', 'Loading %s (%i)' % (self.__class__.__name__, len(files)))
+    def load(self, subdir = None):
+        path = self.path if subdir == None else os.path.join(self.path, subdir)
+
+        if not os.path.isdir(path):
+            c.log('debug', 'DB directory "%s" not found' % path)
+            return
+
+        listdir = os.listdir(path)
+        dirs = [ d for d in listdir if d != 'storage' and os.path.isdir(os.path.join(path, d)) ]
+
+        for d in dirs:
+            if subdir == None:
+                self.load(d)
+            else:
+                self.load(os.path.join(subdir, d))
+
+        files = [ f for f in listdir if f.endswith('.json') ]
+        c.log('debug', 'Loading files %s %s (%i)' % (self.__class__.__name__, path, len(files)))
 
         for f in files:
-            filename = os.path.join(self.path, f)
+            filename = os.path.join(path, f)
+            data_path = os.path.splitext(f)[0] if subdir == None else os.path.join(subdir, os.path.splitext(f)[0])
             data = json.load(open(filename))
-            self.data[data['id']] = data
+            self.data[data_path] = data
```
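The reworked `load()` now recurses into subdirectories (skipping the media `storage` tree) and keys `self.data` by each file's path relative to the class directory, without the `.json` extension, rather than by the record's `id`; `store()` recreates any missing intermediate directories on the way back out. A short illustration of the scheme, with invented ids:

```python
import os

# Invented ids, illustrating the recursive path keys only.
key = os.path.join('photo', str(12345), str(67890))

print key                                   # photo/12345/67890 -> key in self.data
print os.path.join('Media', key + '.json')  # Media/photo/12345/67890.json,
                                            # the file store() writes under backup-dir
```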
2 changes: 2 additions & 0 deletions lib/Dialogs.py
```diff
@@ -25,6 +25,8 @@ def requestDialogs(self):
 
         while True:
             data = Api.request('messages.getDialogs', req_data)
+            if data == None:
+                return
             count = data['count']
             data = data['items']
             for d in data:
```
203 changes: 158 additions & 45 deletions lib/Media.py
```diff
@@ -10,111 +10,224 @@
 
 import time, urllib2, os
 from urlparse import urlparse
+import threading
+from Queue import Queue
 
 import Common as c
 
-class Media:
+from Database import Database
+
+class Media(Database):
+    class Downloader(threading.Thread):
+        def __init__(self, queue, report):
+            threading.Thread.__init__(self)
+            self.queue = queue
+            self.report = report
+            self.waiting = True
+            self._stop = threading.Event()
+
+        def run(self):
+            c.log('debug', 'Downloader thread started')
+            while not self._stop.isSet():
+                if not self.queue.empty():
+                    self.waiting = False
+                    url = self.queue.get()
+                    response = url.download()
+                    if response == False and url.tried < 3:
+                        self.queue.put(url)
+                    elif response == False and url.tried == 3:
+                        self.report['failure'].append(url)
+                    elif response == True:
+                        self.report['success'].append(url)
+                    self.queue.task_done()
+                else:
+                    self.waiting = True
+                    time.sleep(2)
+            c.log('debug', 'Downloader thread stopped')
+
+        def stop(self):
+            self._stop.set()
+
+    class Download(object):
+        def __init__(self, url, destination):
+            self.url = url
+            self.destination = destination
+            self.tried = 0
+            self.success = False
+            self.error = None
+
+        def download(self):
+            if self.tried > 0:
+                time.sleep(self.tried * 2)
+            self.tried += 1
+            try:
+                directory = os.path.dirname(self.destination)
+                if not os.path.isdir(directory):
+                    os.makedirs(directory)
+
+                u = urllib2.urlopen(self.url, None, 30)
+                with open(self.destination, 'wb') as outfile:
+                    # TODO: limit by size
+                    size = int(u.info().getheaders('Content-Length')[0])
+                    while True:
+                        b = u.read(8192)
+                        if not b:
+                            break
+                        outfile.write(b)
+
+                self.success = True
+
+            except Exception as e:
+                self.error = e
+
+            return self.success
+
+    def stopDownloads(self):
+        c.log('debug', 'Stopping download threads')
+        for i in self.threads:
+            i.stop()
+
     def __init__(self):
         c.log('debug', 'Init Media')
+        Database.__init__(self)
 
+        self.total_downloads = 0
+        self.queue = Queue(0)
+        self.report = {'success':[], 'failure':[]}
+        self.threads = []
+
+        for i in range(c.cfg('download-threads')):
+            thread = self.Downloader(self.queue, self.report)
+            thread.start()
+            self.threads.append(thread)
+        if self.queue.qsize() > 0:
+            self.queue.join()
+
+    def store(self):
+        c.log('info', 'Waiting downloads complete: ~%i...' % self.queue.qsize())
+        while not self.queue.empty():
+            c.log('info', '[%s] %i left' % (''.join([str(int(not t.waiting)) for t in self.threads]), self.queue.qsize()))
+            time.sleep(5)
 
-        self.path = os.path.join(c.cfg('backup-dir'), 'media')
+        self.stopDownloads()
 
-        # Create directory
-        if not os.path.isdir(self.path):
-            os.makedirs(self.path)
+        c.log('info', 'Downloaded %i of %i' % (len(self.report['success']), self.total_downloads))
+        if len(self.report['failure']) > 0:
+            c.log('warning', ' failed: %i' % len(self.report['failure']))
+            for url in self.report['failure']:
+                c.log('debug', ' %s' % url.url)
+
+        Database.store(self)
 
     def loadAttachments(self, data):
         attachments = []
         if 'attachments' in data:
             attachments.extend(data['attachments'])
         if 'attachment' in data:
             attachments.append(data['attachment'])
         if 'copy_history' in data:
             self.loadAttachments(data['copy_history'])
         for attach in attachments:
             c.log('debug', 'Processing %s' % attach['type'])
-            funcname = "process" + attach['type'].title()
+            funcname = 'process' + attach['type'].title()
             if funcname in dir(self):
                 getattr(self, funcname)(attach[attach['type']])
             else:
                 c.log('error', ' unable to find attachment processing function "Media.%s"' % funcname)
                 c.log('debug', str(attach))
 
-    def download(self, url, path = None):
+    def addDownload(self, url, path = None):
         if url == '':
             c.log('warning', 'Skipping empty url')
             return path
 
         if path == None:
-            path = self.path + urlparse(url).path
+            path = os.path.join(self.path, 'storage') + urlparse(url).path
 
         if os.path.isfile(path):
             c.log('debug', 'Skipping, file %s already exists' % path)
             return path
 
-        directory = os.path.dirname(path)
-        if not os.path.isdir(directory):
-            os.makedirs(directory)
-
-        for retry in xrange(3):
-            try:
-                u = urllib2.urlopen(url)
-                with open(path, "wb") as outfile:
-                    size = int(u.info().getheaders("Content-Length")[0])
-                    c.log('debug', 'Downloading %s to %s (%ib)' % (url, path, size))
-                    size_dl = 0
-                    while True:
-                        b = u.read(8192)
-                        if not b:
-                            break
-                        size_dl += len(b)
-                        outfile.write(b)
-                break
-            except Exception as e:
-                c.log('warning', 'Retry request %i (3): %s' % (retry, str(e)))
-                time.sleep(2.0*(retry+1))
+        c.log('debug', 'Adding media to queue "%s"' % url)
+        self.total_downloads += 1
+        self.queue.put(self.Download(url, path))
 
         return path
 
-    def processPhoto(self, data):
-        url = None
-        size = 0
-        for key in data.keys():
-            if key.startswith('photo_') and int(key.split('_')[1]) > size:
-                size = int(key.split('_')[1])
-                url = data.pop(key, None)
+    def preprocess(self, data, data_type):
+        # TODO: limit by type
+        mydata = data.copy()
+        data.clear()
+        data['id'] = mydata['id']
+        if 'owner_id' in mydata:
+            path = os.path.join(data_type, str(mydata['owner_id']), str(mydata['id']))
+            data['owner_id'] = mydata['owner_id']
+        else:
+            path = os.path.join(data_type, str(mydata['id']))
+
+        if path in self.data:
+            return None
+        self.data[path] = mydata
 
-        if url == None:
-            c.log('warning', 'Valid url not found in %s' % str(data))
-            return
+        return path
 
-        data['url'] = url
-        data['localpath'] = self.download(data['url'])
+    def processPhoto(self, data):
+        path = self.preprocess(data, 'photo')
+        if path != None:
+            url = None
+            size = 0
+            for key in self.data[path].keys():
+                if key.startswith('photo_'):
+                    if int(key.split('_')[1]) > size:
+                        size = int(key.split('_')[1])
+                        url = self.data[path].pop(key, None)
+                    self.data[path].pop(key, None)
+
+            if url == None:
+                c.log('warning', 'Valid url not found in %s' % str(self.data[path]))
+                return
+
+            self.data[path]['url'] = url
+            self.data[path]['localpath'] = self.addDownload(self.data[path]['url'])
 
     def processDoc(self, data):
-        data['localpath'] = self.download(data['url'])
+        path = self.preprocess(data, 'doc')
+        if path != None:
+            self.data[path]['localpath'] = self.addDownload(self.data[path]['url'])
 
     def processAudio(self, data):
-        data['localpath'] = self.download(data['url'])
+        path = self.preprocess(data, 'audio')
+        if path != None:
+            self.data[path]['localpath'] = self.addDownload(self.data[path]['url'])
 
     def processWall(self, data):
         c.log('debug', 'Processing wall attachments')
         self.loadAttachments(data)
 
     def processGeo(self, data):
+        self.preprocess(data, 'geo')
         c.log('debug', 'Skipping geo attachment - no data to download')
 
     def processVideo(self, data):
+        self.preprocess(data, 'video')
         c.log('debug', 'Skipping video attachment - size of the file is too big')
 
     def processSticker(self, data):
+        self.preprocess(data, 'sticker')
         c.log('debug', 'Skipping sticker attachment - idiotizm')
 
     def processLink(self, data):
         c.log('debug', 'Skipping link attachment - no data to download')
 
     def processPoll(self, data):
+        self.preprocess(data, 'poll')
         c.log('debug', 'Skipping poll attachment - no data to download')
 
     def processNote(self, data):
-        c.log('debug', 'Skipping poll attachment - no data to download')
+        self.preprocess(data, 'note')
+        c.log('debug', 'Skipping note attachment - no data to download')
 
     def processPresent(self, data):
+        self.preprocess(data, 'present')
         c.log('debug', 'Skipping present attachment - stupid present')
 
 S = Media()
```
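
Taken together, the new pipeline is: `loadAttachments()` dispatches each attachment to its `process*` handler, `preprocess()` dedupes the item into `self.data` under a recursive path key, `addDownload()` queues the file for the `Downloader` worker threads, and `store()` waits for the queue to drain before dumping the collected metadata. A sketch of driving it with a single message; the dict shape is inferred from the handlers above, and the ids and urls are invented:

```python
from Media import S  # the module-level Media() singleton created above

# processPhoto() keeps only the largest photo_* size (photo_604 here),
# queues its url for download and records url/localpath in the metadata.
message = {
    'attachments': [
        {'type': 'photo', 'photo': {
            'id': 67890,
            'owner_id': 12345,
            'photo_130': 'https://example.com/small.jpg',
            'photo_604': 'https://example.com/big.jpg',
        }},
    ],
}

S.loadAttachments(message)  # metadata lands in S.data['photo/12345/67890']
S.store()                   # waits for the download threads, then writes the json
```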
2 changes: 2 additions & 0 deletions lib/Messages.py
```diff
@@ -24,6 +24,8 @@ def requestMessages(request, msgs_data):
 
     while True:
         data = Api.request('messages.getHistory', request)
+        if data == None:
+            return
         count = data['count']
         data = data['items']
 
```