import errno
import json
import os
import tempfile
import urllib
from Queue import Empty
from threading import Thread
import numpy as np
import requests
from datetime import datetime
from optparse import OptionParser
from bs4 import BeautifulSoup
from memento_damage import MementoDamage
from multiprocess import Queue, Process, Pool
# from matplotlib import pyplot as plt
def column(matrix, i):
    """Return a list holding element ``i`` of every row in ``matrix``.

    ``matrix`` is any iterable of indexable rows; rows are visited in order
    and an out-of-range ``i`` raises whatever the row's indexing raises.
    """
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked
class MultiProcess(object):
    """Run a function over a sequence using a pool of worker processes.

    Each element is handled inside a pool worker, which in turn runs the
    function on a daemon thread and collects the outcome through a queue.
    """

    def __init__(self, cores=4):
        # Number of worker processes created for every ``map`` call.
        self.cores = cores

    def map(self, function, sequence, native=False):
        """Apply ``function`` to every element of ``sequence`` in parallel.

        :param function: callable taking one element of ``sequence``.
        :param sequence: iterable of work items.
        :param native: accepted for backward compatibility; it is ignored.
        :return: list of results, in the order of ``sequence``.
        :raises Exception: whatever ``function`` raised for an element; the
            error is re-raised by the pool instead of hanging the call.
        """
        def _run(e, q):
            # Always report an outcome. If nothing were queued on failure the
            # blocking ``q.get()`` in ``_worker`` would wait forever.
            try:
                q.put((True, function(e)))
            except Exception as exc:
                q.put((False, exc))

        def _worker(e):
            q = Queue()
            t = Thread(target=_run, args=(e, q,))
            t.daemon = True
            t.start()
            # The join gives up after 60 seconds, but the result is still
            # awaited below, so this is not a hard limit on run time.
            t.join(60)
            succeeded, payload = q.get()
            if not succeeded:
                raise payload
            return payload

        pool = Pool(self.cores)
        try:
            return pool.map(_worker, sequence)
        finally:
            # ``map`` has already collected every result (or failed), so the
            # workers can be stopped; otherwise each call leaks its processes.
            pool.terminate()
            pool.join()
class URIMCrawler(object):
    """Collect URI-Ms (memento URIs) for the URI-Rs listed in a text file.

    For every URI-R the TimeMap index is requested from
    timetravel.mementoweb.org, every index URI found there is fetched, and the
    mementos are gathered as ``(uri_r, uri_m, year, time)`` rows. The rows are
    cached in a comma-separated file so later runs can skip the crawl.
    """

    # Fan-out helper used for both crawl stages.
    pool = MultiProcess(100)

    # Seconds before an HTTP request is abandoned. Without a timeout
    # ``requests.get`` may block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60

    def get_uri_ms(self, index_uri, uri_r, idx, total, *args):
        """Fetch one TimeMap index page and return the mementos it lists.

        :param index_uri: URI of the TimeMap index page to download.
        :param uri_r: original resource the index belongs to.
        :param idx: 1-based position of ``uri_r``, used for progress output.
        :param total: number of URI-Rs, used for progress output.
        :return: list of ``(uri_r, uri_m, year, time)`` tuples of strings;
            empty (or partial) when the request or the parsing fails.
        """
        print('{0}/{1} Get URI-M for {2} from {3}'.format(idx, total, uri_r, index_uri))

        uri_ms = []
        try:
            resp_index = requests.get(index_uri, timeout=self.REQUEST_TIMEOUT)
            if resp_index.status_code == 200:
                resp_index_content = resp_index.content
                # Content is in link format, convert it so it looks like html format
                # NOTE(review): as written this only strips '<' and newlines
                # and appends ' />', which creates no <a> tags for the lookup
                # below to find - confirm the intended replacement strings.
                resp_index_content = resp_index_content.replace('<', '') \
                    .replace('\n', '') + ' />'
                bs = BeautifulSoup(resp_index_content, 'html.parser')
                for memento in bs.findAll('a', rel="memento"):
                    uri_m = memento['href'].strip()
                    uri_m_time = datetime.strptime(memento['datetime'], '%a, %d %b %Y %H:%M:%S %Z')
                    uri_ms.append((uri_r, uri_m, str(uri_m_time.year), str(uri_m_time), ))
        except Exception as e:
            # Best effort: one failing index must not abort the whole crawl,
            # but the failure is reported instead of being silently dropped.
            print('{0}/{1} Failed to get URI-M for {2} from {3}: {4!r}'.format(
                idx, total, uri_r, index_uri, e))

        return uri_ms

    def get_uri_ms_wrapper(self, args):
        """Unpack one work item and call ``get_uri_ms`` (single-argument form for ``pool.map``)."""
        if not isinstance(args, (list, tuple)):
            args = [args, ]
        return self.get_uri_ms(*args)

    def get_index_uris(self, uri_r, idx, total):
        """Return the TimeMap index URIs known for ``uri_r``.

        :param uri_r: original resource to look up.
        :param idx: 1-based position of ``uri_r``, used for progress output.
        :param total: number of URI-Rs, used for progress output.
        :return: list of ``(index_uri, uri_r, idx, total)`` tuples, shaped as
            the arguments of ``get_uri_ms``; empty when the lookup fails.
        """
        print('{0}/{1} Get index uris for URI-R = {2}'.format(idx, total, uri_r))

        index_uris = []
        try:
            resp_timemap = requests.get(
                'http://timetravel.mementoweb.org/timemap/json/{}'.format(uri_r),
                timeout=self.REQUEST_TIMEOUT)
            if resp_timemap.status_code == 200:
                resp_timemap_content = resp_timemap.json()
                index_uris = [(index['uri'], uri_r, idx, total)
                              for index in resp_timemap_content['timemap_index']]
        except Exception as e:
            # Best effort, see get_uri_ms: report the failure and carry on.
            print('{0}/{1} Failed to get index uris for URI-R = {2}: {3!r}'.format(
                idx, total, uri_r, e))

        print('{0}/{1} Index uris = {2} for URI-R = {3}'.format(idx, total, len(index_uris), uri_r))
        return index_uris

    def get_index_uris_wrapper(self, args):
        """Unpack one work item and call ``get_index_uris`` (single-argument form for ``pool.map``)."""
        if not isinstance(args, (list, tuple)):
            args = [args, ]
        return self.get_index_uris(*args)

    def process_input(self, uri_r_file, uri_r_file_output):
        """Return the memento rows for the URI-Rs in ``uri_r_file``.

        When ``uri_r_file_output`` does not exist the URI-Rs are crawled and
        the rows are written to it; otherwise the rows are read back from it.

        :param uri_r_file: text file with one URI-R per line.
        :param uri_r_file_output: comma-separated cache of the crawl result.
        :return: list of ``[uri_r, uri_m, year, time]`` rows (tuples after a
            crawl, lists when loaded from the cache).
        """
        # NOTE(review): fields are joined with ',' without quoting, so a URI
        # containing a comma will not round-trip through the cache file.
        flat_uri_ms = []

        if os.path.exists(uri_r_file_output):
            with open(uri_r_file_output) as f:
                for line in f:
                    line = line.strip()
                    # A blank line would yield a one-field row that cannot be
                    # unpacked into process_uri_m's arguments.
                    if line:
                        flat_uri_ms.append(line.split(','))
            return flat_uri_ms

        with open(uri_r_file) as f:
            # Deduplicate and ignore blank lines, which would otherwise be
            # crawled as an empty URI-R.
            unique_uri_rs = list(set(line.strip() for line in f if line.strip()))
        total = len(unique_uri_rs)
        uri_rs = [(u, i + 1, total) for i, u in enumerate(unique_uri_rs)]

        index_uris = self.pool.map(self.get_index_uris_wrapper, uri_rs, native=True)

        # Combine into flat arrays
        flat_index_uris = []
        for iu in index_uris:
            flat_index_uris += iu

        uri_ms = self.pool.map(self.get_uri_ms_wrapper, flat_index_uris, native=True)

        # Combine into flat arrays
        for iu in uri_ms:
            flat_uri_ms += iu

        with open(uri_r_file_output, 'wb') as f:
            f.write('\n'.join(','.join(r) for r in flat_uri_ms))

        return flat_uri_ms
class URIMDamage(object):
    """Run MementoDamage on a list of URI-Ms and store each result as JSON."""

    # Fan-out helper used to process the URI-Ms in parallel.
    pool = MultiProcess(50)

    def process_uri_m(self, uri_r, uri_m, year, time, outdir):
        """Compute the damage of one memento unless its JSON file already exists.

        :param uri_r: original resource of the memento (passed through).
        :param uri_m: memento URI to analyse; also determines the file name.
        :param year: year of the memento (passed through).
        :param time: timestamp of the memento (passed through).
        :param outdir: directory receiving the ``.json`` result; created when
            missing.
        :return: ``(uri_r, uri_m, year, time)``.
        :raises OSError: when ``outdir`` cannot be created.
        """
        # '/' and '.' are replaced so the quoted URI is usable as a file name.
        quoted_url = urllib.quote(uri_m).replace('/', '_').replace('.', '-')
        out_json_file = os.path.join(outdir, quoted_url + '.json')

        try:
            os.makedirs(outdir)
        except OSError as e:
            # Another worker may have created the directory first.
            if e.errno != errno.EEXIST:
                raise

        print('Processing {0}'.format(uri_m))

        exists = os.path.exists(out_json_file)
        if not exists:
            # NOTE(review): the temporary directory is never removed here;
            # presumably MementoDamage cleans its cache on finish - confirm.
            m = MementoDamage(uri_m, tempfile.mkdtemp())
            m.set_show_debug_message()
            m.set_output_mode_json()
            # m.set_dont_clean_cache_on_finish()
            m.set_follow_redirection()
            m.run()

            result = m.get_result()
            try:
                with open(out_json_file, 'wb') as f:
                    json.dump(result, f, indent=4)
            except BaseException:
                # A truncated file would be taken for a finished result by
                # the existence check on the next run, so discard it.
                if os.path.exists(out_json_file):
                    os.remove(out_json_file)
                raise

        print('Processing {0} Done: {1}'.format(uri_m, 'Exists' if exists else None))
        return uri_r, uri_m, year, time

    def process_uri_m_wrapper(self, args):
        """Unpack one work item and call ``process_uri_m`` (single-argument form for ``pool.map``)."""
        if not isinstance(args, (list, tuple)):
            args = [args, ]
        return self.process_uri_m(*args)

    def process(self, uri_ms, outdir):
        """Process every row of ``uri_ms`` in parallel.

        :param uri_ms: iterable of ``(uri_r, uri_m, year, time)`` rows.
        :param outdir: directory receiving the JSON results.
        :return: list of ``(uri_r, uri_m, year, time)`` tuples, one per row.
        """
        p_uri_ms = [list(u) + [outdir, ] for u in uri_ms]
        return self.pool.map(self.process_uri_m_wrapper, p_uri_ms)
def main(argv=None):
    """Crawl the URI-Ms for a URI-R list and compute the damage of each one.

    Expects exactly three positional arguments: the URI-R input file, the
    comma-separated URI-M cache file, and the output directory for the JSON
    results.

    :param argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    :return: the list returned by ``URIMDamage.process``.
    :raises SystemExit: with status 2 when the argument count is wrong.
    """
    parser = OptionParser(usage='%prog URI_R_FILE URI_M_CSV OUTPUT_DIR')
    options, args = parser.parse_args(argv)
    if len(args) != 3:
        print('Parameter must be three.')
        print(' ')
        # A usage error must not report success to the calling shell.
        raise SystemExit(2)

    uri_ms = URIMCrawler().process_input(args[0], args[1])
    # The per-year damage plot that used to follow was already disabled: it
    # lived in a string literal, needs matplotlib, and unpacks a fifth
    # "damage" field that process_uri_m does not return. It is not kept.
    return URIMDamage().process(uri_ms, args[2])


if __name__ == '__main__':
    main()