Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A more efficient and accurate memento and archive counts #284

Merged
merged 2 commits into from
Dec 15, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 13 additions & 20 deletions bundledApps/WAIL.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
# from wx import *
import waybackConfigWriter
from subprocess import Popen, PIPE
from subprocess import check_output

# For a more asynchronous UI, esp with accessible()s
from multiprocessing import Pool as Thread
Expand Down Expand Up @@ -334,19 +333,6 @@ def __init__(self, parent):
self.uri.Bind(wx.EVT_KEY_UP, self.uriChanged) # Call memgator on URI change


def getHosts(self, tm):
    """Count memento entries per host in a link-format TimeMap string.

    Every entry of the form ``<URI>; rel="... memento"`` found in ``tm``
    contributes one count to the network location (host) of its URI.
    Entries whose ``rel`` value does not end in ``memento`` are ignored.

    Args:
        tm: TimeMap text to scan (presumably MemGator link-format output;
            confirm against the caller).

    Returns:
        A dict mapping each host to the number of matching entries seen
        for it. Empty when no entry matches.
    """
    # Negated character classes keep every match inside a single <...>
    # entry and a single quoted rel value. A greedy '.*' would merge
    # several entries that share one line into one match and under-count.
    matches = re.findall(r'<([^>]*)>;\s*rel="[^"]*memento"', tm)

    hosts = {}
    for match in matches:
        host = urlparse(match).netloc
        hosts[host] = hosts.get(host, 0) + 1
    return hosts


def setMementoCount(self, mCount, aCount=''):
ui_mementoCountMessage_pos = (105, 85)
ui_mementoCountMessage_size = (150, 20)
Expand Down Expand Up @@ -384,20 +370,27 @@ def setMessage(self, msg):
def fetchMementos(self):
# TODO: Use CDXJ for counting the mementos
currentURIValue = self.uri.GetValue()
out = check_output([memGatorPath, "-a", archivesJSON,
print('MEMGATOR checking {0}'.format(currentURIValue))
mg = Popen([memGatorPath,
'--arcs', archivesJSON,
'--format', 'cdxj',
'--restimeout', '0m3s',
'--hdrtimeout', '3s',
'--contimeout', '3s',
currentURIValue])
print('MEMGATOR checking {0}'.format(currentURIValue))
currentURIValue], stdout=PIPE)

# TODO: bug, on Gogo internet MemGator cannot hit aggregator, which
# results in 0 mementos, for which MemGator throws exception

mCount = out.count("memento")
aCount = len(self.getHosts(out))
mCount = 0
archHosts = set()
for line in mg.stdout:
l = line.strip()
if l[:1].isdigit():
mCount += 1
archHosts.add(l.split('/')[2])

self.setMementoCount(mCount, aCount) # UI not updated on Windows
self.setMementoCount(mCount, len(archHosts)) # UI not updated on Windows

print('MEMGATOR counted {0} {1}'.format(currentURIValue, mCount))
# TODO: cache the TM
Expand Down