Huge set of changes (you need to remove the old backup and download again)
* Added a timeout for url requests
* Added more error catching & reporting
* Redesigned the database to allow storing & reading recursive path keys
* Multithreaded download of media files (with the new download-threads option; see the config sketch below)
* Media also stores the metadata of items
* Downloading of profile photos for all users
* Downloading of photos & blog for your user
* A number of small bugfixes
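
The multithreaded downloader sizes its worker pool from the new `download-threads` option (read via `c.cfg('download-threads')` in lib/Media.py). A minimal sketch of how it might look in the config ini file the README mentions; the section name is an assumption, only the `download-threads` key comes from this commit:

```ini
; Hypothetical section name -- only the download-threads key is
; introduced by this commit; 4 worker threads is an arbitrary choice.
[vk-backup]
download-threads = 4
```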
rabits committed Oct 21, 2014
1 parent 6d63d67 commit 22b9fb9
Showing 10 changed files with 330 additions and 107 deletions.
15 changes: 8 additions & 7 deletions README.md
```diff
@@ -16,16 +16,17 @@ You can use config ini file to store your configuration.
 
 Features:
 ---------
-* Download known users & friends
-* Download dialogs
-* Download chats
-* Download attachments media
+* Store known users & friends
+* Dialogs
+* Chats
+* Attachments media
+* Wall
+* Photos
 
 TODO:
 -----
-* Download wall
-* Download groups
-* Download photos, audio, video
+* Groups
+* Photo albums, audio, video
 * Advanced configuration
 
 Requirements:
```
6 changes: 5 additions & 1 deletion lib/Api.py
```diff
@@ -37,14 +37,18 @@ def request(method, params):
             params['access_token'] = _TOKEN
             params['v'] = '5.25'
             url = "https://api.vk.com/method/%s?%s" % (method, urlencode(params))
-            data = json.loads(urllib2.urlopen(url).read())
+            data = json.loads(urllib2.urlopen(url, None, 30).read())
             if 'response' not in data:
                 raise Exception('no correct response while calling api method "%s", data: %s' % (method, data))
             break
         except Exception as e:
             c.log('warning', 'Retry request %i (3): %s' % (retry, str(e)))
             time.sleep(2.0*(retry+1))
 
+    if 'response' not in data:
+        c.log('error', 'Unable to process request')
+        return None
+
     return data['response']
 
 def getUserId():
```
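
Since `Api.request()` now returns `None` once its three retries are exhausted, instead of raising on the final failure, every call site must check for `None` before indexing into the result; the Chats, Dialogs and Messages changes below add exactly that guard. A minimal caller sketch under the new contract; `requestUser` is a hypothetical helper, not part of this commit:

```python
import Api

def requestUser(user_id):
    # Api.request() returns the parsed 'response' payload,
    # or None after three failed attempts.
    data = Api.request('users.get', {'user_ids': user_id})
    if data == None:
        return None  # bail out early, same style as the real call sites
    return data[0]   # users.get returns a list of user objects
```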
2 changes: 2 additions & 0 deletions lib/Chats.py
```diff
@@ -24,6 +24,8 @@ def requestChatInfo(self, chat_id):
             'log': []
         }
         data = Api.request('messages.getChat', {'chat_id': chat_id})
+        if data == None:
+            return
         if len(data['users']) > 0:
             Users.requestUsers([ str(u) for u in data['users'] ])
         self.data[chat_id]['data'] = data
```
2 changes: 1 addition & 1 deletion lib/Common
Submodule Common updated 1 file
+2 −1 Common.py
36 changes: 26 additions & 10 deletions lib/Database.py
```diff
@@ -20,24 +20,40 @@ def __init__(self):
 
         self.path = os.path.join(c.cfg('backup-dir'), self.__class__.__name__)
 
         # Create directory
         if not os.path.isdir(self.path):
             os.makedirs(self.path)
 
         # Loading local data from the storage
         self.load()
 
     def store(self):
         c.log('debug', 'Store %s (%i)' % (self.__class__.__name__, len(self.data)))
         for i in self.data:
-            with codecs.open(os.path.join(self.path, i + '.json'), 'w', 'utf-8') as outfile:
+            path = os.path.join(self.path, i + '.json')
+            if not os.path.isdir(os.path.dirname(path)):
+                os.makedirs(os.path.dirname(path))
+            with codecs.open(path, 'w', 'utf-8') as outfile:
                 json.dump(self.data[i], outfile, indent=1, ensure_ascii=False)
 
-    def load(self):
-        files = [ f for f in os.listdir(self.path) if f.endswith('.json') ]
-        c.log('debug', 'Loading %s (%i)' % (self.__class__.__name__, len(files)))
+    def load(self, subdir = None):
+        path = self.path if subdir == None else os.path.join(self.path, subdir)
+
+        if not os.path.isdir(path):
+            c.log('debug', 'DB directory "%s" not found' % path)
+            return
+
+        listdir = os.listdir(path)
+        dirs = [ d for d in listdir if d != 'storage' and os.path.isdir(os.path.join(path, d)) ]
+
+        for d in dirs:
+            if subdir == None:
+                self.load(d)
+            else:
+                self.load(os.path.join(subdir, d))
+
+        files = [ f for f in listdir if f.endswith('.json') ]
+        c.log('debug', 'Loading files %s %s (%i)' % (self.__class__.__name__, path, len(files)))
 
         for f in files:
-            filename = os.path.join(self.path, f)
+            filename = os.path.join(path, f)
+            data_path = os.path.splitext(f)[0] if subdir == None else os.path.join(subdir, os.path.splitext(f)[0])
             data = json.load(open(filename))
-            self.data[data['id']] = data
+            self.data[data_path] = data
```
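The reworked `load()` now recurses into subdirectories (skipping the media `storage` tree) and keys `self.data` by each file's path relative to the class directory, without the `.json` extension, rather than by the record's `id`; `store()` recreates any missing intermediate directories on the way back out. A short illustration of the scheme, with invented ids:

```python
import os

# Invented ids, illustrating the recursive path keys only.
key = os.path.join('photo', str(12345), str(67890))

print key                                   # photo/12345/67890 -> key in self.data
print os.path.join('Media', key + '.json')  # Media/photo/12345/67890.json,
                                            # the file store() writes under backup-dir
```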
2 changes: 2 additions & 0 deletions lib/Dialogs.py
```diff
@@ -25,6 +25,8 @@ def requestDialogs(self):
 
         while True:
             data = Api.request('messages.getDialogs', req_data)
+            if data == None:
+                return
             count = data['count']
             data = data['items']
             for d in data:
```
203 changes: 158 additions & 45 deletions lib/Media.py
```diff
@@ -10,111 +10,224 @@
 
 import time, urllib2, os
 from urlparse import urlparse
+import threading
+from Queue import Queue
 
 import Common as c
 
-class Media:
+from Database import Database
+
+class Media(Database):
+    class Downloader(threading.Thread):
+        def __init__(self, queue, report):
+            threading.Thread.__init__(self)
+            self.queue = queue
+            self.report = report
+            self.waiting = True
+            self._stop = threading.Event()
+
+        def run(self):
+            c.log('debug', 'Downloader thread started')
+            while not self._stop.isSet():
+                if not self.queue.empty():
+                    self.waiting = False
+                    url = self.queue.get()
+                    response = url.download()
+                    if response == False and url.tried < 3:
+                        self.queue.put(url)
+                    elif response == False and url.tried == 3:
+                        self.report['failure'].append(url)
+                    elif response == True:
+                        self.report['success'].append(url)
+                    self.queue.task_done()
+                else:
+                    self.waiting = True
+                    time.sleep(2)
+            c.log('debug', 'Downloader thread stopped')
+
+        def stop(self):
+            self._stop.set()
+
+    class Download(object):
+        def __init__(self, url, destination):
+            self.url = url
+            self.destination = destination
+            self.tried = 0
+            self.success = False
+            self.error = None
+
+        def download(self):
+            if self.tried > 0:
+                time.sleep(self.tried * 2)
+            self.tried += 1
+            try:
+                directory = os.path.dirname(self.destination)
+                if not os.path.isdir(directory):
+                    os.makedirs(directory)
+
+                u = urllib2.urlopen(self.url, None, 30)
+                with open(self.destination, 'wb') as outfile:
+                    # TODO: limit by size
+                    size = int(u.info().getheaders('Content-Length')[0])
+                    while True:
+                        b = u.read(8192)
+                        if not b:
+                            break
+                        outfile.write(b)
+
+                self.success = True
+
+            except Exception as e:
+                self.error = e
+
+            return self.success
+
+    def stopDownloads(self):
+        c.log('debug', 'Stopping download threads')
+        for i in self.threads:
+            i.stop()
+
     def __init__(self):
         c.log('debug', 'Init Media')
+        Database.__init__(self)
 
+        self.total_downloads = 0
+        self.queue = Queue(0)
+        self.report = {'success':[], 'failure':[]}
+        self.threads = []
+
+        for i in range(c.cfg('download-threads')):
+            thread = self.Downloader(self.queue, self.report)
+            thread.start()
+            self.threads.append(thread)
+        if self.queue.qsize() > 0:
+            self.queue.join()
+
+    def store(self):
+        c.log('info', 'Waiting downloads complete: ~%i...' % self.queue.qsize())
+        while not self.queue.empty():
+            c.log('info', '[%s] %i left' % (''.join([str(int(not t.waiting)) for t in self.threads]), self.queue.qsize()))
+            time.sleep(5)
 
-        self.path = os.path.join(c.cfg('backup-dir'), 'media')
+        self.stopDownloads()
 
-        # Create directory
-        if not os.path.isdir(self.path):
-            os.makedirs(self.path)
+        c.log('info', 'Downloaded %i of %i' % (len(self.report['success']), self.total_downloads))
+        if len(self.report['failure']) > 0:
+            c.log('warning', ' failed: %i' % len(self.report['failure']))
+            for url in self.report['failure']:
+                c.log('debug', ' %s' % url.url)
+
+        Database.store(self)
 
     def loadAttachments(self, data):
         attachments = []
         if 'attachments' in data:
             attachments.extend(data['attachments'])
         if 'attachment' in data:
             attachments.append(data['attachment'])
         if 'copy_history' in data:
             self.loadAttachments(data['copy_history'])
         for attach in attachments:
             c.log('debug', 'Processing %s' % attach['type'])
-            funcname = "process" + attach['type'].title()
+            funcname = 'process' + attach['type'].title()
             if funcname in dir(self):
                 getattr(self, funcname)(attach[attach['type']])
             else:
                 c.log('error', ' unable to find attachment processing function "Media.%s"' % funcname)
                 c.log('debug', str(attach))
 
-    def download(self, url, path = None):
+    def addDownload(self, url, path = None):
         if url == '':
             c.log('warning', 'Skipping empty url')
             return path
 
         if path == None:
-            path = self.path + urlparse(url).path
+            path = os.path.join(self.path, 'storage') + urlparse(url).path
 
         if os.path.isfile(path):
             c.log('debug', 'Skipping, file %s already exists' % path)
             return path
 
-        directory = os.path.dirname(path)
-        if not os.path.isdir(directory):
-            os.makedirs(directory)
-
-        for retry in xrange(3):
-            try:
-                u = urllib2.urlopen(url)
-                with open(path, "wb") as outfile:
-                    size = int(u.info().getheaders("Content-Length")[0])
-                    c.log('debug', 'Downloading %s to %s (%ib)' % (url, path, size))
-                    size_dl = 0
-                    while True:
-                        b = u.read(8192)
-                        if not b:
-                            break
-                        size_dl += len(b)
-                        outfile.write(b)
-                break
-            except Exception as e:
-                c.log('warning', 'Retry request %i (3): %s' % (retry, str(e)))
-                time.sleep(2.0*(retry+1))
+        c.log('debug', 'Adding media to queue "%s"' % url)
+        self.total_downloads += 1
+        self.queue.put(self.Download(url, path))
 
         return path
 
-    def processPhoto(self, data):
-        url = None
-        size = 0
-        for key in data.keys():
-            if key.startswith('photo_') and int(key.split('_')[1]) > size:
-                size = int(key.split('_')[1])
-                url = data.pop(key, None)
+    def preprocess(self, data, data_type):
+        # TODO: limit by type
+        mydata = data.copy()
+        data.clear()
+        data['id'] = mydata['id']
+        if 'owner_id' in mydata:
+            path = os.path.join(data_type, str(mydata['owner_id']), str(mydata['id']))
+            data['owner_id'] = mydata['owner_id']
+        else:
+            path = os.path.join(data_type, str(mydata['id']))
+
+        if path in self.data:
+            return None
+        self.data[path] = mydata
 
-        if url == None:
-            c.log('warning', 'Valid url not found in %s' % str(data))
-            return
+        return path
 
-        data['url'] = url
-        data['localpath'] = self.download(data['url'])
+    def processPhoto(self, data):
+        path = self.preprocess(data, 'photo')
+        if path != None:
+            url = None
+            size = 0
+            for key in self.data[path].keys():
+                if key.startswith('photo_'):
+                    if int(key.split('_')[1]) > size:
+                        size = int(key.split('_')[1])
+                        url = self.data[path].pop(key, None)
+                    self.data[path].pop(key, None)
+
+            if url == None:
+                c.log('warning', 'Valid url not found in %s' % str(self.data[path]))
+                return
+
+            self.data[path]['url'] = url
+            self.data[path]['localpath'] = self.addDownload(self.data[path]['url'])
 
     def processDoc(self, data):
-        data['localpath'] = self.download(data['url'])
+        path = self.preprocess(data, 'doc')
+        if path != None:
+            self.data[path]['localpath'] = self.addDownload(self.data[path]['url'])
 
     def processAudio(self, data):
-        data['localpath'] = self.download(data['url'])
+        path = self.preprocess(data, 'audio')
+        if path != None:
+            self.data[path]['localpath'] = self.addDownload(self.data[path]['url'])
 
     def processWall(self, data):
         c.log('debug', 'Processing wall attachments')
         self.loadAttachments(data)
 
     def processGeo(self, data):
+        self.preprocess(data, 'geo')
         c.log('debug', 'Skipping geo attachment - no data to download')
 
     def processVideo(self, data):
+        self.preprocess(data, 'video')
         c.log('debug', 'Skipping video attachment - size of the file is too big')
 
     def processSticker(self, data):
+        self.preprocess(data, 'sticker')
         c.log('debug', 'Skipping sticker attachment - idiotizm')
 
     def processLink(self, data):
         c.log('debug', 'Skipping link attachment - no data to download')
 
     def processPoll(self, data):
+        self.preprocess(data, 'poll')
         c.log('debug', 'Skipping poll attachment - no data to download')
 
     def processNote(self, data):
-        c.log('debug', 'Skipping poll attachment - no data to download')
+        self.preprocess(data, 'note')
+        c.log('debug', 'Skipping note attachment - no data to download')
 
     def processPresent(self, data):
+        self.preprocess(data, 'present')
         c.log('debug', 'Skipping present attachment - stupid present')
 
 S = Media()
```
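
Taken together, the new pipeline is: `loadAttachments()` dispatches each attachment to its `process*` handler, `preprocess()` dedupes the item into `self.data` under a recursive path key, `addDownload()` queues the file for the `Downloader` worker threads, and `store()` waits for the queue to drain before dumping the collected metadata. A sketch of driving it with a single message; the dict shape is inferred from the handlers above, and the ids and urls are invented:

```python
from Media import S  # the module-level Media() singleton created above

# processPhoto() keeps only the largest photo_* size (photo_604 here),
# queues its url for download and records url/localpath in the metadata.
message = {
    'attachments': [
        {'type': 'photo', 'photo': {
            'id': 67890,
            'owner_id': 12345,
            'photo_130': 'https://example.com/small.jpg',
            'photo_604': 'https://example.com/big.jpg',
        }},
    ],
}

S.loadAttachments(message)  # metadata lands in S.data['photo/12345/67890']
S.store()                   # waits for the download threads, then writes the json
```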
2 changes: 2 additions & 0 deletions lib/Messages.py
```diff
@@ -24,6 +24,8 @@ def requestMessages(request, msgs_data):
 
     while True:
         data = Api.request('messages.getHistory', request)
+        if data == None:
+            return
         count = data['count']
         data = data['items']
 
```