#!/usr/bin/python # vim: et sw=4 ts=4: # -*- coding: utf-8 -*- # # Matomo - free/libre analytics platform # # @link https://matomo.org # @license https://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later # @version $Id$ # # For more info see: https://matomo.org/log-analytics/ and https://matomo.org/docs/log-analytics-tool-how-to/ # # Requires Python 3.5, 3.6 or 3.7 # from __future__ import print_function # this is needed that python2 can run the script until the warning below import sys if sys.version_info[0] != 3: print('The log importer currently does not support Python 2 any more.') print('Please use Python 3.5, 3.6, 3.7 or 3.8') sys.exit(1) import base64 import bz2 import configparser import codecs import datetime import fnmatch import gzip import hashlib import http.client import inspect import itertools import json import logging import argparse import os import os.path import queue import re import ssl import sys import threading import time import urllib.request, urllib.parse, urllib.error import urllib.request, urllib.error, urllib.parse import urllib.parse import subprocess import traceback import socket import textwrap import collections import glob import io # Avoid "got more than 100 headers" error http.client._MAXHEADERS = 1000 ## ## Constants. 
## STATIC_EXTENSIONS = set(( 'gif jpg jpeg png bmp ico svg svgz ttf otf eot woff woff2 class swf css js xml webp' ).split()) STATIC_FILES = set(( 'robots.txt' ).split()) DOWNLOAD_EXTENSIONS = set(( '7z aac arc arj asf asx avi bin csv deb dmg doc docx exe flac flv gz gzip hqx ' 'ibooks jar json mpg mp2 mp3 mp4 mpeg mov movie msi msp odb odf odg odp ' 'ods odt ogg ogv pdf phps ppt pptx qt qtm ra ram rar rpm rtf sea sit tar tbz ' 'bz2 tbz tgz torrent txt wav webm wma wmv wpd xls xlsx xml xsd z zip ' 'azw3 epub mobi apk ' 'md5 sig' ).split()) # If you want to add more bots, take a look at the Matomo Device Detector botlist: # https://github.com/matomo-org/device-detector/blob/master/regexes/bots.yml # user agents must be lowercase EXCLUDED_USER_AGENTS = ( 'adsbot-google', 'ask jeeves', 'baidubot', 'bot-', 'bot/', 'ccooter/', 'crawl', 'curl', 'echoping', 'exabot', 'feed', 'googlebot', 'ia_archiver', 'java/', 'libwww', 'mediapartners-google', 'msnbot', 'netcraftsurvey', 'panopta', 'pingdom.com_bot_', 'robot', 'spider', 'surveybot', 'twiceler', 'voilabot', 'yahoo', 'yandex', 'zabbix', 'googlestackdrivermonitoring', ) MATOMO_DEFAULT_MAX_ATTEMPTS = 3 MATOMO_DEFAULT_DELAY_AFTER_FAILURE = 10 DEFAULT_SOCKET_TIMEOUT = 300 MATOMO_EXPECTED_IMAGE = base64.b64decode( 'R0lGODlhAQABAIAAAAAAAAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw==' ) ## ## Formats. ## class BaseFormatException(Exception): pass class BaseFormat: def __init__(self, name): self.name = name self.regex = None self.date_format = '%d/%b/%Y:%H:%M:%S' def check_format(self, file): line = file.readline() try: file.seek(0) except IOError: pass return self.check_format_line(line) def check_format_line(self, line): return False class NginxJsonFormat(BaseFormat): def __init__(self, name): super(NginxJsonFormat, self).__init__(name) self.json = None self.date_format = '%Y-%m-%dT%H:%M:%S' def check_format_line(self, line): try: self.json = json.loads(line) # Check if it contains these: "idsite", "referrer", and date". 
# This is unique to nginx, we can use this to tell it apart from other json log formats. if "idsite" in self.json and "referrer" in self.json and "date" in self.json: return True return False except: return False def match(self, line): try: # nginx outputs malformed JSON w/ hex escapes when confronted w/ non-UTF input. we have to # workaround this by converting hex escapes in strings to unicode escapes. the conversion is naive, # so it does not take into account the string's actual encoding (which we don't have access to). line = line.replace('\\x', '\\u00') self.json = json.loads(line) return self except: self.json = None return None def get(self, key): # Some ugly patchs ... if key == 'generation_time_milli': self.json[key] = int(float(self.json[key]) * 1000) # Patch date format ISO 8601 elif key == 'date': tz = self.json[key][19:] self.json['timezone'] = tz.replace(':', '') self.json[key] = self.json[key][:19] try: return self.json[key] except KeyError: raise BaseFormatException() def get_all(self,): return self.json def remove_ignored_groups(self, groups): for group in groups: del self.json[group] class TraefikJsonFormat(BaseFormat): TRAEFIK_KEYS_MAPPING = { 'date': 'time', 'generation_time_milli': 'Duration', 'host': 'RequestHost', 'ip': 'ClientHost', 'length': 'DownstreamContentSize', 'method': 'RequestMethod', 'path': 'RequestPath', 'referrer': 'request_Referer', 'status': 'DownstreamStatus', 'userid': 'ClientUsername', 'user_agent': 'request_User-Agent', } def __init__(self, name): super(TraefikJsonFormat, self).__init__(name) self.json = None self.date_format = '%Y-%m-%dT%H:%M:%S' def check_format_line(self, line): try: self.json = json.loads(line) # Check if it contains all of these: "level", "msg", and "time". # This is unique to Traefik, we can use this to tell it apart from other json log formats. 
if "msg" in self.json and "level" in self.json and "time" in self.json: return True return False except: return False def match(self, line): try: self.json = json.loads(line) return self except: self.json = None return None def get(self, key): value = '' try: value = self.json[self.TRAEFIK_KEYS_MAPPING[key]] if key == 'generation_time_milli': value = value / 1000000 # Patch date format ISO 8601, example: 2023-08-14T12:25:56+02:00 if key == 'date': tz = value[19:] # get the last part self.json['timezone'] = tz.replace(':', '') value = value[:19] except: logging.debug("Could not find %s in Traefik log", key) return '' return str(value) def get_all(self,): modified_json = self.json.copy() REVERSED_KEYS_MAPPING = {v: k for k, v in self.TRAEFIK_KEYS_MAPPING.items()} for key in self.json: new_key = REVERSED_KEYS_MAPPING.get(key, key) if new_key != key: modified_json[new_key] = modified_json.pop(key) return modified_json def remove_ignored_groups(self, groups): for group in groups: del self.json[group] class RegexFormat(BaseFormat): def __init__(self, name, regex, date_format=None): super(RegexFormat, self).__init__(name) if regex is not None: self.regex = re.compile(regex) if date_format is not None: self.date_format = date_format self.matched = None def check_format_line(self, line): return self.match(line) def match(self,line): if not self.regex: return None match_result = self.regex.match(line) if match_result: self.matched = match_result.groupdict() if 'time' in self.matched: self.matched['date'] = self.matched['date'] + ' ' + self.matched['time'] del self.matched['time'] else: self.matched = None return match_result def get(self, key): try: return self.matched[key] except KeyError: raise BaseFormatException("Cannot find group '%s'." 
                % key)

    def get_all(self,):
        # Return the groupdict() of the last successful match.
        return self.matched

    def remove_ignored_groups(self, groups):
        for group in groups:
            del self.matched[group]

class W3cExtendedFormat(RegexFormat):
    """W3C Extended Log File Format; the field list is read from the
    '#Fields: ' directive in the log's header."""

    FIELDS_LINE_PREFIX = '#Fields: '
    # Fallback pattern for fields we have no specific regex for
    # (a quoted string or a bare token).
    REGEX_UNKNOWN_FIELD = r'(?:".*?"|\S+)'

    # NOTE(review): the named-group patterns below look garbled in this
    # copy — the group names appear to have been stripped (expected e.g.
    # r'"?(?P<date>\d+[-\d+]+)"?'); verify against the upstream source.
    fields = {
        'date': r'"?(?P\d+[-\d+]+)"?',
        'time': r'"?(?P