#!/usr/bin/python
# -*- coding: utf-8 -*-
# vim: et sw=4 ts=4:
#
# Matomo - free/libre analytics platform
#
# @link https://matomo.org
# @license https://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
# @version $Id$
#
# For more info see: https://matomo.org/log-analytics/ and https://matomo.org/docs/log-analytics-tool-how-to/
#
# Requires Python 2.6 or 2.7
#
# NOTE: the encoding declaration above must stay on the first or second line
# of the file, otherwise the interpreter ignores it (PEP 263).

import sys

# Refuse to run on anything but Python 2: the imports below use the Python 2
# module names (ConfigParser, httplib, Queue, urllib2, urlparse).
if sys.version_info[0] != 2:
    print('The log importer currently does not work with Python 3 (or higher)')
    print('Please use Python 2.6 or 2.7')
    sys.exit(1)

import base64
import bz2
import ConfigParser
import datetime
import fnmatch
import gzip
import hashlib
import httplib
import inspect
import itertools
import logging
import optparse
import os
import os.path
import Queue
import re
import ssl
import sys
import threading
import time
import urllib
import urllib2
import urlparse
import subprocess
import functools
import traceback
import socket
import textwrap
import glob

# Prefer the standard library json module and fall back to simplejson.
try:
    import json
except ImportError:
    try:
        import simplejson as json
    except ImportError:
        # Neither implementation is importable: stop here with a clear message
        # instead of continuing and failing later with a NameError on `json`.
        sys.stderr.write('simplejson (http://pypi.python.org/pypi/simplejson/) is required.\n')
        sys.exit(1)

##
## Constants.
##

STATIC_EXTENSIONS = set((
    'gif jpg jpeg png bmp ico svg svgz ttf otf eot woff woff2 class swf css js xml webp'
).split())

STATIC_FILES = set((
    'robots.txt'
).split())

DOWNLOAD_EXTENSIONS = set((
    '7z aac arc arj asf asx avi bin csv deb dmg doc docx exe flac flv gz gzip hqx '
    'ibooks jar json mpg mp2 mp3 mp4 mpeg mov movie msi msp odb odf odg odp '
    'ods odt ogg ogv pdf phps ppt pptx qt qtm ra ram rar rpm rtf sea sit tar tbz '
    'bz2 tbz tgz torrent txt wav webm wma wmv wpd xls xlsx xml xsd z zip '
    'azw3 epub mobi apk'
).split())

# A good source is: http://phpbb-bots.blogspot.com/
# user agents must be lowercase
EXCLUDED_USER_AGENTS = (
    'adsbot-google',
    'ask jeeves',
    'baidubot',
    'bot-',
    'bot/',
    'ccooter/',
    'crawl',
    'curl',
    'echoping',
    'exabot',
    'feed',
    'googlebot',
    'ia_archiver',
    'java/',
    'libwww',
    'mediapartners-google',
    'msnbot',
    'netcraftsurvey',
    'panopta',
    'pingdom.com_bot_',
    'robot',
    'spider',
    'surveybot',
    'twiceler',
    'voilabot',
    'yahoo',
    'yandex',
)

MATOMO_DEFAULT_MAX_ATTEMPTS = 3
MATOMO_DEFAULT_DELAY_AFTER_FAILURE = 10
DEFAULT_SOCKET_TIMEOUT = 300

# Raw bytes of a GIF image, stored base64-encoded.
MATOMO_EXPECTED_IMAGE = base64.b64decode(
    'R0lGODlhAQABAIAAAAAAAAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw=='
)

##
## Formats.
##


class BaseFormatException(Exception):
    """Raised by a format's get() when the requested field is not available."""
    pass


class BaseFormat(object):
    """Common base of the log formats: a name, an optional regex and a date format."""

    def __init__(self, name):
        self.name = name
        self.regex = None
        self.date_format = '%d/%b/%Y:%H:%M:%S'

    def check_format(self, file):
        """Read the first line of `file` and return check_format_line() for it.

        The file is rewound afterwards when it supports seeking; an IOError
        from seek() is ignored.
        """
        line = file.readline()
        try:
            file.seek(0)
        except IOError:
            pass

        return self.check_format_line(line)

    def check_format_line(self, line):
        """Return whether `line` is in this format; the base class never matches."""
        return False


class JsonFormat(BaseFormat):
    """Format for logs in which every line is one JSON object."""

    def __init__(self, name):
        super(JsonFormat, self).__init__(name)
        self.json = None
        # Keys of the current line whose value get() has already rewritten,
        # so that the rewrite is applied only once per line.
        self._patched_keys = set()
        self.date_format = '%Y-%m-%dT%H:%M:%S'

    def check_format_line(self, line):
        """Return True and remember the decoded object if `line` is a JSON object.

        Anything that fails to decode, or decodes to a non-object value
        (number, string, list, ...), is rejected: every other method indexes
        the decoded value by field name.
        """
        try:
            decoded = json.loads(line)
        except Exception:
            # Deliberately broad (any decoding failure means "not this
            # format"), but no longer traps KeyboardInterrupt/SystemExit.
            return False

        if not isinstance(decoded, dict):
            return False

        self.json = decoded
        self._patched_keys = set()
        return True

    def match(self, line):
        """Decode `line`; return self on success, None otherwise.

        On failure self.json is reset to None.
        """
        self._patched_keys = set()
        try:
            # nginx outputs malformed JSON w/ hex escapes when confronted w/ non-UTF input. we have to
            # workaround this by converting hex escapes in strings to unicode escapes. the conversion is naive,
            # so it does not take into account the string's actual encoding (which we don't have access to).
            decoded = json.loads(line.replace('\\x', '\\u00'))
        except Exception:
            # Same best-effort policy as check_format_line().
            decoded = None

        if not isinstance(decoded, dict):
            self.json = None
            return None

        self.json = decoded
        return self

    def get(self, key):
        """Return the value stored under `key` for the current line.

        Two keys are rewritten in place the first time they are requested:
        'generation_time_milli' is multiplied by 1000 and truncated to int,
        and 'date' is cut to its first 19 characters while the remainder,
        with ':' removed, is stored under 'timezone'.

        Raises BaseFormatException when `key` is missing.
        """
        try:
            if key not in self._patched_keys:
                # Some ugly patches ...
                if key == 'generation_time_milli':
                    self.json[key] = int(float(self.json[key]) * 1000)
                    self._patched_keys.add(key)
                # Patch date format ISO 8601
                elif key == 'date':
                    raw_date = self.json[key]
                    self.json['timezone'] = raw_date[19:].replace(':', '')
                    self.json[key] = raw_date[:19]
                    self._patched_keys.add(key)

            return self.json[key]
        except KeyError:
            raise BaseFormatException("Cannot find key '%s'." % key)

    def get_all(self,):
        """Return the decoded object of the current line."""
        return self.json

    def remove_ignored_groups(self, groups):
        """Delete every key named in `groups` from the current line's object."""
        for group in groups:
            del self.json[group]


class RegexFormat(BaseFormat):
    """Format whose lines are parsed with a regular expression using named groups."""

    def __init__(self, name, regex, date_format=None):
        super(RegexFormat, self).__init__(name)
        if regex is not None:
            self.regex = re.compile(regex)
        if date_format is not None:
            self.date_format = date_format
        self.matched = None

    def check_format_line(self, line):
        """Return the result of match() for `line` (a match object or None)."""
        return self.match(line)

    def match(self, line):
        """Match `line` against the regex and store the named groups.

        Returns the match object, or None when there is no regex or no match.
        When a 'time' group is present it is appended to the 'date' group
        (separated by a space) and then removed.
        """
        if not self.regex:
            return None
        match_result = self.regex.match(line)
        if match_result:
            self.matched = match_result.groupdict()
            if 'time' in self.matched:
                self.matched['date'] = self.matched['date'] + ' ' + self.matched['time']
                del self.matched['time']
        else:
            self.matched = None
        return match_result

    def get(self, key):
        """Return the value of group `key`; raise BaseFormatException if absent."""
        try:
            return self.matched[key]
        except KeyError:
            raise BaseFormatException("Cannot find group '%s'." % key)

    def get_all(self,):
        """Return the dict of groups of the last matched line (None if no match)."""
        return self.matched

    def remove_ignored_groups(self, groups):
        """Delete every group named in `groups` from the last match."""
        for group in groups:
            del self.matched[group]


# NOTE(review): the text reviewed here continues with the beginning of
# W3cExtendedFormat but breaks off inside the 'time' field pattern (and the
# named-group identifiers appear to have been lost from the patterns). The
# fragment is preserved verbatim below rather than completed by guesswork.
#
# class W3cExtendedFormat(RegexFormat):
#     FIELDS_LINE_PREFIX = '#Fields: '
#     REGEX_UNKNOWN_FIELD = r'(?:".*?"|\S+)'
#     fields = {
#         'date': r'"?(?P\d+[-\d+]+)"?',
#         'time': r'"?(?P