Feature #20103 » matomo_import_logs.py

Leonardo Candela, Apr 08, 2021 01:27 PM

 
1
#!/usr/bin/python
2
# vim: et sw=4 ts=4:
3
# -*- coding: utf-8 -*-
4
#
5
# Matomo - free/libre analytics platform
6
#
7
# @link https://matomo.org
8
# @license https://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
9
# @version $Id$
10
#
11
# For more info see: https://matomo.org/log-analytics/ and https://matomo.org/docs/log-analytics-tool-how-to/
12
#
13
# Requires Python 3.5, 3.6, 3.7 or higher
14
#
15
from __future__ import print_function  # needed so that Python 2 can still run the script far enough to print the warning below
16
import sys
17

    
18
if sys.version_info[0] != 3:
19
    print('OpenAIRE Generic tracker does not support Python 2 any more.')
20
    print('Please use Python 3.5, 3.6, 3.7 or 3.8')
21
    sys.exit(1)
22

    
23
import bz2
24
import datetime
25
import gzip
26
import http.client as httplib
27
import inspect
28
import itertools
29
import logging
30
import os
31
import os.path
32
import queue as queue
33
import re
34
import sys
35
import threading
36
import time
37
import urllib.request as urllib2
38
import urllib.parse as urlparse
39
import traceback
40
import socket
41
import textwrap
42
import yaml
43
import getopt
44

    
45

    
46
try:
47
    import json
48
except ImportError:
49
    try:
50
        import simplejson as json
51
    except ImportError:
52
        if sys.version_info < (2, 6):
53
            print('simplejson (http://pypi.python.org/pypi/simplejson/) is required.', file=sys.stderr)
54
            sys.exit(1)
55

    
56

    
57

    
58
##
59
## Constants.
60
##
61

    
62
MATOMO_DEFAULT_MAX_ATTEMPTS = 3
63
MATOMO_DEFAULT_DELAY_AFTER_FAILURE = 10
64
DEFAULT_SOCKET_TIMEOUT = 300
65

    
66
##
67
## Formats.
68
##
69

    
70
class BaseFormatException(Exception): pass
71

    
72
class BaseFormat(object):
73
    def __init__(self, name):
74
        self.name = name
75
        self.regex = None
76
        self.date_format = '%d/%b/%Y:%H:%M:%S'
77

    
78
    def check_format(self, file):
79
        line = file.readline()
80
        try:
81
            file.seek(0)
82
        except IOError:
83
            pass
84

    
85
        return self.check_format_line(line)
86

    
87
    def check_format_line(self, line):
88
        return False
89

    
90
class JsonFormat(BaseFormat):
91
    def __init__(self, name):
92
        super(JsonFormat, self).__init__(name)
93
        self.json = None
94
        self.date_format = '%Y-%m-%dT%H:%M:%S'
95

    
96
    def check_format_line(self, line):
97
        try:
98
            self.json = json.loads(line)
99
            return True
100
        except:
101
            return False
102

    
103
    def match(self, line):
104
        try:
105
            # nginx outputs malformed JSON w/ hex escapes when confronted w/ non-UTF input. we have to
106
            # workaround this by converting hex escapes in strings to unicode escapes. the conversion is naive,
107
            # so it does not take into account the string's actual encoding (which we don't have access to).
108
            line = line.replace('\\x', '\\u00')
109

    
110
            self.json = json.loads(line)
111
            return self
112
        except:
113
            self.json = None
114
            return None
115

    
116
    def get(self, key):
117
        # Some ugly patches ...
118
        if key == 'generation_time_milli':
119
            self.json[key] =  int(float(self.json[key]) * 1000)
120
        # Patch date format ISO 8601
121
        elif key == 'date':
122
            tz = self.json[key][19:]
123
            self.json['timezone'] = tz.replace(':', '')
124
            self.json[key] = self.json[key][:19]
125

    
126
        try:
127
            return self.json[key]
128
        except KeyError:
129
            raise BaseFormatException()
130

    
131
    def get_all(self,):
132
        return self.json
133

    
134
    def remove_ignored_groups(self, groups):
135
        for group in groups:
136
            del self.json[group]
137

    
138
class RegexFormat(BaseFormat):
139

    
140
    def __init__(self, name, regex, date_format=None):
141
        super(RegexFormat, self).__init__(name)
142
        if regex is not None:
143
            self.regex = re.compile(regex)
144
        if date_format is not None:
145
            self.date_format = date_format
146
        self.matched = None
147

    
148
    def check_format_line(self, line):
149
        return self.match(line)
150

    
151
    def match(self,line):
152
        if not self.regex:
153
            return None
154
        match_result = self.regex.match(line)
155
        if match_result:
156
            self.matched = match_result.groupdict()
157
        else:
158
            self.matched = None
159
        return match_result
160

    
161
    def get(self, key):
162
        try:
163
            return self.matched[key]
164
        except KeyError:
165
            raise BaseFormatException("Cannot find group '%s'." % key)
166

    
167
    def get_all(self,):
168
        return self.matched
169

    
170
    def remove_ignored_groups(self, groups):
171
        for group in groups:
172
            del self.matched[group]
173

    
174
class W3cExtendedFormat(RegexFormat):
175

    
176
    FIELDS_LINE_PREFIX = '#Fields: '
177

    
178
    fields = {
179
        'date': '(?P<date>\d+[-\d+]+',
180
        'time': '[\d+:]+)[.\d]*?', # TODO: should not assume date & time will always be together; not sure how to fix ATM.
181
        'cs-uri-stem': '(?P<path>/\S*)',
182
        'cs-uri-query': '(?P<query_string>\S*)',
183
        'c-ip': '"?(?P<ip>[\w*.:-]*)"?',
184
        'cs(User-Agent)': '(?P<user_agent>".*?"|\S*)',
185
        'cs(Referer)': '(?P<referrer>\S+)',
186
        'sc-status': '(?P<status>\d+)',
187
        'sc-bytes': '(?P<length>\S+)',
188
        'cs-host': '(?P<host>\S+)',
189
        'cs-method': '(?P<method>\S+)',
190
        'cs-username': '(?P<userid>\S+)',
191
        'time-taken': '(?P<generation_time_secs>[.\d]+)'
192
    }
193

    
194
    def __init__(self):
195
        super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S')
196

    
197
    def check_format(self, file):
198
        self.create_regex(file)
199

    
200
        # if we couldn't create a regex, this file does not follow the W3C extended log file format
201
        if not self.regex:
202
            try:
203
                file.seek(0)
204
            except IOError:
205
                pass
206

    
207
            return
208

    
209
        first_line = file.readline()
210

    
211
        try:
212
            file.seek(0)
213
        except IOError:
214
            pass
215

    
216
        return self.check_format_line(first_line)
217

    
218
    def create_regex(self, file):
219
        fields_line = None
220
        #if config.options.w3c_fields:
221
        #    fields_line = config.options.w3c_fields
222

    
223
        # collect all header lines up until the Fields: line
224
        # if we're reading from stdin, we can't seek, so don't read any more than the Fields line
225
        header_lines = []
226
        while fields_line is None:
227
            line = file.readline().strip()
228

    
229
            if not line:
230
                continue
231

    
232
            if not line.startswith('#'):
233
                break
234

    
235
            if line.startswith(W3cExtendedFormat.FIELDS_LINE_PREFIX):
236
                fields_line = line
237
            else:
238
                header_lines.append(line)
239

    
240
        if not fields_line:
241
            return
242

    
243
        # store the header lines for a later check for IIS
244
        self.header_lines = header_lines
245

    
246
        # Parse the 'Fields: ' line to create the regex to use
247
        full_regex = []
248

    
249
        expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping
250

    
251
        # if the --w3c-time-taken-millisecs option is used, make sure the time-taken field is interpreted as milliseconds
252
        #if config.options.w3c_time_taken_in_millisecs:
253
        #    expected_fields['time-taken'] = '(?P<generation_time_milli>[\d.]+)'
254

    
255
        for mapped_field_name, field_name in config.options.custom_w3c_fields.items():
256
            expected_fields[mapped_field_name] = expected_fields[field_name]
257
            del expected_fields[field_name]
258

    
259
        # add custom field regexes supplied through --w3c-field-regex option
260
        #for field_name, field_regex in config.options.w3c_field_regexes.iteritems():
261
        #    expected_fields[field_name] = field_regex
262

    
263
        # Skip the 'Fields: ' prefix.
264
        fields_line = fields_line[9:].strip()
265
        for field in re.split('\s+', fields_line):
266
            try:
267
                regex = expected_fields[field]
268
            except KeyError:
269
                regex = '(?:".*?"|\S+)'
270
            full_regex.append(regex)
271
        full_regex = '\s+'.join(full_regex)
272

    
273
        logging.debug("Based on 'Fields:' line, computed regex to be %s", full_regex)
274

    
275
        self.regex = re.compile(full_regex)
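        # Illustrative sketch (not part of the original script): given a header line such as
        #   #Fields: date time c-ip cs-method cs-uri-stem sc-status
        # the loop above looks up each field name in expected_fields and joins the
        # per-field regexes with '\s+', so a log line like
        #   2021-04-08 13:27:00 192.0.2.10 GET /record/1234 200
        # is matched with the date and time captured together in the 'date' group and
        # the remaining fields in the ip, method, path and status groups. Field names
        # not listed in expected_fields fall back to the generic '(?:".*?"|\S+)' pattern.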
276

    
277
    def check_for_iis_option(self):
278
       logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS"
279
                         " stores millisecond values in the time-taken field. If your logfile does this, the aforementioned"
280
                         " option must be used in order to get accurate generation times.")
281

    
282
    def _is_iis(self):
283
        return len([line for line in self.header_lines if 'internet information services' in line.lower() or 'iis' in line.lower()]) > 0
284

    
285
    def _is_time_taken_milli(self):
286
        return 'generation_time_milli' not in self.regex.pattern
287

    
288
class IisFormat(W3cExtendedFormat):
289

    
290
    fields = W3cExtendedFormat.fields.copy()
291
    fields.update({
292
        'time-taken': '(?P<generation_time_milli>[.\d]+)',
293
        'sc-win32-status': '(?P<__win32_status>\S+)' # this group is useless for log importing, but capturing it
294
                                                     # will ensure we always select IIS for the format instead of
295
                                                     # W3C logs when detecting the format. This way there will be
296
                                                     # less accidental importing of IIS logs w/o --w3c-time-taken-milli.
297
    })
298

    
299
    def __init__(self):
300
        super(IisFormat, self).__init__()
301

    
302
        self.name = 'iis'
303

    
304
class ShoutcastFormat(W3cExtendedFormat):
305

    
306
    fields = W3cExtendedFormat.fields.copy()
307
    fields.update({
308
        'c-status': '(?P<status>\d+)',
309
        'x-duration': '(?P<generation_time_secs>[.\d]+)'
310
    })
311

    
312
    def __init__(self):
313
        super(ShoutcastFormat, self).__init__()
314

    
315
        self.name = 'shoutcast'
316

    
317
    def get(self, key):
318
        if key == 'user_agent':
319
            user_agent = super(ShoutcastFormat, self).get(key)
320
            return urllib2.unquote(user_agent)
321
        else:
322
            return super(ShoutcastFormat, self).get(key)
323

    
324
class AmazonCloudFrontFormat(W3cExtendedFormat):
325

    
326
    fields = W3cExtendedFormat.fields.copy()
327
    fields.update({
328
        'x-event': '(?P<event_action>\S+)',
329
        'x-sname': '(?P<event_name>\S+)',
330
        'cs-uri-stem': '(?:rtmp:/)?(?P<path>/\S*)',
331
        'c-user-agent': '(?P<user_agent>".*?"|\S+)',
332

    
333
        # following are present to match cloudfront instead of W3C when we know it's cloudfront
334
        'x-edge-location': '(?P<x_edge_location>".*?"|\S+)',
335
        'x-edge-result-type': '(?P<x_edge_result_type>".*?"|\S+)',
336
        'x-edge-request-id': '(?P<x_edge_request_id>".*?"|\S+)',
337
        'x-host-header': '(?P<x_host_header>".*?"|\S+)'
338
    })
339

    
340
    def __init__(self):
341
        super(AmazonCloudFrontFormat, self).__init__()
342

    
343
        self.name = 'amazon_cloudfront'
344

    
345
    def get(self, key):
346
        if key == 'event_category' and 'event_category' not in self.matched:
347
            return 'cloudfront_rtmp'
348
        elif key == 'status' and 'status' not in self.matched:
349
            return '200'
350
        elif key == 'user_agent':
351
            user_agent = super(AmazonCloudFrontFormat, self).get(key)
352
            return urllib2.unquote(user_agent)
353
        else:
354
            return super(AmazonCloudFrontFormat, self).get(key)
355

    
356
_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)?\s+'
357

    
358
_COMMON_LOG_FORMAT = (
359
    '(?P<ip>[\w*.:-]+)\s+\S+\s+(?P<userid>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+'
360
    '"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+(?P<length>\S+)'
361
)
362
_NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
363
    '\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
364
)
365
_S3_LOG_FORMAT = (
366
    '\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>[\w*.:-]+)\s+'
367
    '(?P<userid>\S+)\s+\S+\s+\S+\s+\S+\s+"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+\S+\s+(?P<length>\S+)\s+'
368
    '\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
369
)
370
_ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT +
371
    '\s+(?P<session_time>[0-9-]+)'
372
)
373
_ELB_LOG_FORMAT = (
374
    '(?P<date>[0-9-]+T[0-9:]+)\.\S+\s+\S+\s+(?P<ip>[\w*.:-]+):\d+\s+\S+:\d+\s+\S+\s+(?P<generation_time_secs>\S+)\s+\S+\s+'
375
    '(?P<status>\d+)\s+\S+\s+\S+\s+(?P<length>\S+)\s+'
376
    '"\S+\s+\w+:\/\/(?P<host>[\w\-\.]*):\d+(?P<path>\/\S*)\s+[^"]+"\s+"(?P<user_agent>[^"]+)"\s+\S+\s+\S+'
377
)
378

    
379
_OVH_FORMAT = (
380
    '(?P<ip>\S+)\s+' + _HOST_PREFIX + '(?P<userid>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+'
381
    '"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)'
382
    '\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
383
)
384

    
385

    
386
FORMATS = {
387
    'common': RegexFormat('common', _COMMON_LOG_FORMAT),
388
    'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT),
389
    'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT),
390
    'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT),
391
    'w3c_extended': W3cExtendedFormat(),
392
    'amazon_cloudfront': AmazonCloudFrontFormat(),
393
    'iis': IisFormat(),
394
    'shoutcast': ShoutcastFormat(),
395
    's3': RegexFormat('s3', _S3_LOG_FORMAT),
396
    'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT),
397
    'elb': RegexFormat('elb', _ELB_LOG_FORMAT, '%Y-%m-%dT%H:%M:%S'),
398
    'nginx_json': JsonFormat('nginx_json'),
399
    'ovh': RegexFormat('ovh', _OVH_FORMAT)
400
}
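# Illustrative example (not part of the original script): a classic Apache/NCSA
# "common" access-log line such as
#   1.2.3.4 - - [10/Feb/2021:14:02:01 +0100] "GET /record/1234 HTTP/1.1" 200 5678
# is matched by FORMATS['common']; the named groups then yield
#   ip='1.2.3.4', date='10/Feb/2021:14:02:01', timezone='+0100',
#   method='GET', path='/record/1234', status='200', length='5678',
# and the date is later parsed with the format's date_format '%d/%b/%Y:%H:%M:%S'.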
401

    
402
##
403
## Code.
404
##
405

    
406
class Configuration(object):
407
    """
408
    Stores all the configuration options, read from matomo_config.yaml and
409
    from the command line arguments (sys.argv).
410

    
411
    It has 2 attributes: options and filenames.
412
    """
413

    
414
    class Error(Exception):
415
        pass
416

    
417
    def _create_parser(self):
418
        matomoConfig = None
419
        with open("matomo_config.yaml", 'r') as stream:
420
            try:
421
                matomoConfig=yaml.load(stream, Loader=yaml.FullLoader)
422
            except yaml.YAMLError as exc:
423
                print(exc)
424

    
425

    
426
        """
427
        Return the options loaded above from matomo_config.yaml.
428
        """
429
        self.options = matomoConfig
430
        return self.options
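    # Illustrative sketch (an assumption, not part of the original script) of the
    # matomo_config.yaml structure read above; the key names mirror the
    # config.options[...] lookups made throughout this script, while the values
    # shown are only placeholders:
    #
    #   Matomo_Parameters:
    #     matomo_url: "https://analytics.example.org"
    #     token_auth: "<token_auth of the Matomo user>"
    #     idSite: 1
    #     repository_base_url: "https://repository.example.org"
    #     default_socket_timeout: 300
    #     default_max_attempts: 3
    #     delay_after_failure: 10
    #     max_payload: 200
    #     COUNTER_Robots_url: "https://example.org/COUNTER_Robots_list.json"
    #     oaipmh_preamble: "oai:repository.example.org:"
    #     tracking_metadata: ["^/record/(.*)"]
    #     tracking_download: ["^/bitstream/(.*)"]
    #     ip_anonymization: false
    #     LogFileFormat: null   # or a custom regex registered as 'custom_input_format'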
431

    
432

    
433
    def _parse_args(self, options):
434
        """
435
        Parse the command line args and create self.options and self.filenames.
436
        """
437
        filePath = os.path.abspath(sys.argv[-1])
438
        if os.path.isdir(filePath):
439
            self.filenames = [os.path.join(filePath, x) for x in os.listdir(filePath)]
440
        elif os.path.isfile(filePath):
441
            self.filenames = [filePath]
442
        
443
        # Configure logging before calling logging.{debug,info}.
444
        logging.basicConfig(
445
            format='%(asctime)s: [%(levelname)s] %(message)s',
446
            filename='Matomo_import.log',
447
            level=logging.INFO,
448
        )
449

    
450
    def __init__(self):
451
        self._parse_args(self._create_parser())
452

    
453
    def get_resolver(self):
454
        if self.options.site_id:
455
            logging.debug('Resolver: static')
456
            return StaticResolver(self.options.site_id)
457
        else:
458
            logging.debug('Resolver: dynamic')
459
            return DynamicResolver()
460

    
461

    
462

    
463
class UrlHelper(object):
464

    
465
    @staticmethod
466
    def convert_array_args(args):
467
        """
468
        Converts PHP deep query param arrays (eg, w/ names like hsr_ev[abc][0][]=value) into a nested list/dict
469
        structure that will convert correctly to JSON.
470
        """
471

    
472
        final_args = {}
473
        for key, value in args.items():
474
            indices = key.split('[')
475
            if '[' in key:
476
                # contains list of all indices, eg for abc[def][ghi][] = 123, indices would be ['abc', 'def', 'ghi', '']
477
                indices = [i.rstrip(']') for i in indices]
478

    
479
                # navigate the multidimensional array final_args, creating lists/dicts when needed, using indices
480
                element = final_args
481
                for i in range(0, len(indices) - 1):
482
                    idx = indices[i]
483

    
484
                    # if there's no next key, then this element is a list, otherwise a dict
485
                    element_type = list if not indices[i + 1] else dict
486
                    if idx not in element or not isinstance(element[idx], element_type):
487
                        element[idx] = element_type()
488

    
489
                    element = element[idx]
490

    
491
                # set the value in the final container we navigated to
492
                if not indices[-1]: # last index is '[]'
493
                    element.append(value)
494
                else: # last index has a key, eg, '[abc]'
495
                    element[indices[-1]] = value
496
            else:
497
                final_args[key] = value
498

    
499
        return UrlHelper._convert_dicts_to_arrays(final_args)
500

    
501
    @staticmethod
502
    def _convert_dicts_to_arrays(d):
503
        # convert dicts that have contiguous integer keys to arrays
504
        for key, value in d.items():
505
            if not isinstance(value, dict):
506
                continue
507

    
508
            if UrlHelper._has_contiguous_int_keys(value):
509
                d[key] = UrlHelper._convert_dict_to_array(value)
510
            else:
511
                d[key] = UrlHelper._convert_dicts_to_arrays(value)
512

    
513
        return d
514

    
515
    @staticmethod
516
    def _has_contiguous_int_keys(d):
517
        for i in range(0, len(d)):
518
            if str(i) not in d:
519
                return False
520
        return True
521

    
522
    @staticmethod
523
    def _convert_dict_to_array(d):
524
        result = []
525
        for i in range(0, len(d)):
526
            result.append(d[str(i)])
527
        return result
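    # Illustrative sketch (not part of the original script) of the conversion above:
    #
    #   >>> UrlHelper.convert_array_args({'hsr_ev[abc][0][]': 'value', 'plain': '1'})
    #   {'hsr_ev': {'abc': [['value']]}, 'plain': '1'}
    #
    # The bracketed key is unfolded into nested dicts/lists, and the inner dict with
    # the contiguous integer key '0' is then collapsed into a real list so that the
    # structure serialises naturally to JSON.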
528

    
529

    
530
class Matomo(object):
531
    """
532
    Make requests to Matomo.
533
    """
534
    class Error(Exception):
535

    
536
        def __init__(self, message, code = None):
537
            super(Exception, self).__init__(message)
538

    
539
            self.code = code
540

    
541
    class RedirectHandlerWithLogging(urllib2.HTTPRedirectHandler):
542
        """
543
        Special implementation of HTTPRedirectHandler that logs redirects in debug mode
544
        to help users debug system issues.
545
        """
546

    
547
        def redirect_request(self, req, fp, code, msg, hdrs, newurl):
548
            logging.debug("Request redirected (code: %s) to '%s'" % (code, newurl))
549

    
550
            return urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, hdrs, newurl)
551

    
552
    @staticmethod
553
    def _call(path, args, headers=None, url=None, data=None):
554
        """
555
        Make a request to the Matomo site. It is up to the caller to format
556
        arguments, to embed authentication, etc.
557
        """
558
        if url is None:
559
            url = config.options['Matomo_Parameters']['matomo_url']
560
        headers = headers or {}
561
        if data is None:
562
            # If Content-Type isn't defined, PHP does not parse the request's body.
563
            headers['Content-type'] = 'application/x-www-form-urlencoded'
564
            data = urlparse.urlencode(args)
565
        elif not isinstance(data, str) and headers.get('Content-type') == 'application/json':
566
            data = json.dumps(data).encode("utf-8")
567

    
568
            if args:
569
                path = path + '?' + urlparse.urlencode(args)
570

    
571
        headers['User-Agent'] = 'Matomo/LogImport'
572

    
573
        try:
574
            timeout = config.options['Matomo_Parameters']['default_socket_timeout']
575
        except:
576
            timeout = None # the config global object may not be created at this point
577

    
578
        request = urllib2.Request(url + path, data, headers)
579

    
580

    
581
        # Use non-default SSL context if invalid certificates shall be
582
        # accepted.
583
        '''
584
        if config.options.accept_invalid_ssl_certificate and \
585
                sys.version_info >= (2, 7, 9):
586
            ssl_context = ssl.create_default_context()
587
            ssl_context.check_hostname = False
588
            ssl_context.verify_mode = ssl.CERT_NONE
589
            https_handler_args = {'context': ssl_context}
590
        else:
591
            https_handler_args = {}
592
        opener = urllib2.build_opener(
593
            Matomo.RedirectHandlerWithLogging(),
594
            urllib2.HTTPSHandler(**https_handler_args))
595
        response = opener.open(request, timeout = timeout)
596
        result = response.read()
597
        response.close()
598
        return result
599
        '''
600
        https_handler_args = {}
601
        opener = urllib2.build_opener(
602
            Matomo.RedirectHandlerWithLogging(),
603
            urllib2.HTTPSHandler(**https_handler_args))
604
        response = opener.open(request, timeout = timeout)
605
        result = response.read()
606
        response.close()
607
        return result
608

    
609
    @staticmethod
610
    def _call_api(method, **kwargs):
611
        """
612
        Make a request to the Matomo API taking care of authentication, body
613
        formatting, etc.
614
        """
615
        args = {
616
            'module' : 'API',
617
            'format' : 'json2',
618
            'method' : method,
619
            'filter_limit' : '-1',
620
        }
621
        if kwargs:
622
            args.update(kwargs)
623

    
624
        # Convert lists into appropriate format.
625
        # See: http://developer.matomo.org/api-reference/reporting-api#passing-an-array-of-data-as-a-parameter
626
        # Warning: we have to pass the parameters in order: foo[0], foo[1], foo[2]
627
        # and not foo[1], foo[0], foo[2] (it will break Matomo otherwise.)
628
        final_args = []
629
        for key, value in args.items():
630
            if isinstance(value, (list, tuple)):
631
                for index, obj in enumerate(value):
632
                    final_args.append(('%s[%d]' % (key, index), obj))
633
            else:
634
                final_args.append((key, value))
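        # Illustrative note (not part of the original script): a list-valued parameter
        # such as args={'urls': ['a', 'b']} is expanded by the loop above into the
        # ordered pairs ('urls[0]', 'a'), ('urls[1]', 'b'), which is the indexed form
        # the Matomo reporting API expects.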
635

    
636

    
637
#        logging.debug('%s' % final_args)
638
#        logging.debug('%s' % url)
639

    
640
        res = Matomo._call('/', final_args)  # no explicit url: _call() falls back to the configured matomo_url
641

    
642
        try:
643
            return json.loads(res)
644
        except ValueError:
645
            raise urllib2.URLError('Matomo returned an invalid response: ' + res.decode('utf-8', errors='replace'))
646

    
647
    @staticmethod
648
    def _call_wrapper(func, expected_response, on_failure, *args, **kwargs):
649
        """
650
        Try to make requests to Matomo at most max_attempts times (MATOMO_DEFAULT_MAX_ATTEMPTS by default).
651
        """
652
        errors = 0
653
        while True:
654
            try:
655
                response = func(*args, **kwargs)
656
                if expected_response is not None and response != expected_response:
657
                    if on_failure is not None:
658
                        error_message = on_failure(response, kwargs.get('data'))
659
                    else:
660
                        error_message = "didn't receive the expected response. Response was %s " % response
661

    
662
                    raise urllib2.URLError(error_message)
663
                return response
664
            except (urllib2.URLError, httplib.HTTPException, ValueError, socket.timeout) as e:
665
                logging.info('Error when connecting to Matomo: %s', e)
666

    
667
                code = None
668
                if isinstance(e, urllib2.HTTPError):
669
                    # See Python issue 13211.
670
                    message = 'HTTP Error %s %s' % (e.code, e.msg)
671
                    code = e.code
672
                elif isinstance(e, urllib2.URLError):
673
                    message = e.reason
674
                else:
675
                    message = str(e)
676

    
677
                # decorate message w/ HTTP response, if it can be retrieved
678
                if hasattr(e, 'read'):
679
                    message = message + ", response: " + e.read()
680

    
681
                try:
682
                    delay_after_failure = config.options["Matomo_Parameters"]["delay_after_failure"]
683
                    max_attempts = config.options["Matomo_Parameters"]["default_max_attempts"]
684
                except (NameError, KeyError, TypeError):
685
                    delay_after_failure = MATOMO_DEFAULT_DELAY_AFTER_FAILURE
686
                    max_attempts = MATOMO_DEFAULT_MAX_ATTEMPTS
687

    
688
                errors += 1
689
                if errors == max_attempts:
690
                    logging.info("Max number of attempts reached, server is unreachable!")
691

    
692
                    raise Matomo.Error(message, code)
693
                else:
694
                    logging.info("Retrying request, attempt number %d" % (errors + 1))
695

    
696
                    time.sleep(delay_after_failure)
697

    
698
    @classmethod
699
    def call(cls, path, args, expected_content=None, headers=None, data=None, on_failure=None):
700
        return cls._call_wrapper(cls._call, expected_content, on_failure, path, args, headers,
701
                                    data=data)
702

    
703
    @classmethod
704
    def call_api(cls, method, **kwargs):
705
        return cls._call_wrapper(cls._call_api, None, None, method, **kwargs)
706

    
707
class Recorder(object):
708
    """
709
    A Recorder fetches hits from the Queue and inserts them into Matomo using
710
    the API.
711
    """
712

    
713
    recorders = []
714

    
715
    def __init__(self):
716
        self.queue = queue.Queue(maxsize=2)
717

    
718
        # if bulk tracking disabled, make sure we can store hits outside of the Queue
719
        #if not config.options.use_bulk_tracking:
720
        #    self.unrecorded_hits = []
721

    
722
    @classmethod
723
    def launch(cls, recorder_count):
724
        """
725
        Launch a bunch of Recorder objects in a separate thread.
726
        """
727
        for i in range(recorder_count):            
728
            recorder = Recorder()
729
            cls.recorders.append(recorder)
730

    
731
            #run = recorder._run_bulk if config.options.use_bulk_tracking else recorder._run_single
732
            run = recorder._run_bulk
733
            t = threading.Thread(target=run)
734

    
735
            t.daemon = True
736
            t.start()
737
            logging.debug('Launched recorder')
738

    
739
    @classmethod
740
    def add_hits(cls, all_hits):
741
        """
742
        Add a set of hits to the recorders queue.
743
        """
744
        # Organize hits so that one client IP will always use the same queue.
745
        # We have to do this so visits from the same IP will be added in the right order.
746
        hits_by_client = [[] for r in cls.recorders]
747
        for hit in all_hits:
748
            hits_by_client[hit.get_visitor_id_hash() % len(cls.recorders)].append(hit)
749

    
750
        for i, recorder in enumerate(cls.recorders):
751
            recorder.queue.put(hits_by_client[i])
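        # Illustrative note (not part of the original script): with, say, two recorders,
        # every hit is routed by abs(hash(hit.ip)) % 2, so all requests from one client
        # IP always land in the same recorder queue and are replayed in log order.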
752

    
753
    @classmethod
754
    def wait_empty(cls):
755
        """
756
        Wait until all recorders have an empty queue.
757
        """
758
        for recorder in cls.recorders:
759
            recorder._wait_empty()
760

    
761
    def _run_bulk(self):
762
        while True:
763
            try:
764
                hits = self.queue.get()
765
            except:
766
                # TODO: we should log something here, however when this happens, logging.etc will throw
767
                return
768

    
769
            if len(hits) > 0:
770
                try:
771
                    self._record_hits(hits)
772
                except Matomo.Error as e:
773
                    fatal_error(e, hits[0].filename, hits[0].lineno) # approximate location of error
774
            self.queue.task_done()
775

    
776
    def _run_single(self):
777
        while True:
778

    
779
            if len(self.unrecorded_hits) > 0:
780
                hit = self.unrecorded_hits.pop(0)
781

    
782
                try:
783
                    self._record_hits([hit])
784
                except Matomo.Error as e:
785
                    fatal_error(e, hit.filename, hit.lineno)
786
            else:
787
                self.unrecorded_hits = self.queue.get()
788
                self.queue.task_done()
789

    
790
    def _wait_empty(self):
791
        """
792
        Wait until the queue is empty.
793
        """
794
        while True:
795
            if self.queue.empty():
796
                # We still have to wait for the last queue item being processed
797
                # (queue.empty() returns True before queue.task_done() is
798
                # called).
799
                self.queue.join()
800
                return
801
            time.sleep(1)
802

    
803
    def date_to_matomo(self, date):
804
        date, time = date.isoformat(sep=' ').split()
805
        return '%s %s' % (date, time.replace('-', ':'))
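    # Illustrative sketch (not part of the original script):
    #   >>> Recorder().date_to_matomo(datetime.datetime(2021, 4, 8, 13, 27, 1))
    #   '2021-04-08 13:27:01'
    # i.e. the 'cdt' tracking parameter is sent in Matomo's 'YYYY-MM-DD HH:MM:SS' form.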
806

    
807
    def _get_hit_args(self, hit):
808
        """
809
        Returns the args used in tracking a hit, without the token_auth.
810
        """
811
        #site_id, main_url = resolver.resolve(hit)
812
        site_id = config.options['Matomo_Parameters']['idSite']
813
        # repository base URL
814
        main_url = config.options['Matomo_Parameters']['repository_base_url']
815

    
816
        #stats.dates_recorded.add(hit.date.date())
817

    
818
        path = hit.path
819

    
820
        '''
821
        query_string_delimiter="?"
822
        if hit.query_string:
823
            path += config.options.query_string_delimiter + hit.query_string
824
        '''
825

    
826
        # only prepend main url / host if it's a path
827
        url_prefix = self._get_host_with_protocol(hit.host, main_url) if hasattr(hit, 'host') else main_url
828
        url = (url_prefix if path.startswith('/') else '') + path[:1024]
829

    
830
        # handle custom variables before generating args dict
831
        #if hit.is_robot:
832
        #    hit.add_visit_custom_var("Bot", hit.user_agent)
833
        #else:
834
        #    hit.add_visit_custom_var("Not-Bot", hit.user_agent)
835

    
836
        if (hit.referrer.find("?") >=0):
837
            hit.referrer = hit.referrer.split("?")[0]+" "
838

    
839

    
840
        args = {
841
            'rec': '1',
842
            'apiv': '1',
843
            'url': url,
844
            'urlref': hit.referrer[:1024],
845
            'cip': hit.ip,
846
            'cdt': self.date_to_matomo(hit.date),
847
            'idsite': site_id,
848
            'ua': hit.user_agent
849
        }
850

    
851
        # idsite is already determined by resolver
852
        if 'idsite' in hit.args:
853
            del hit.args['idsite']
854
            
855
        args.update(hit.args)
856

    
857
        if hit.is_download:
858
            args['download'] = args['url']
859

    
860
        #if config.options.enable_bots:
861
        args['bots'] = '1'
862

    
863
        '''
864
        if hit.is_error or hit.is_redirect:
865
            args['action_name'] = '%s%sURL = %s%s' % (
866
                hit.status, '/',
867
                urllib.quote(args['url'], ''),
868
                ("%sFrom = %s" % (
869
                    '/',
870
                    urllib.quote(args['urlref'], '')
871
                ) if args['urlref'] != ''  else '')
872
            )
873
        '''
874
        if hit.generation_time_milli > 0:
875
            args['gt_ms'] = int(hit.generation_time_milli)
876

    
877
        if hit.event_category and hit.event_action:
878
            args['e_c'] = hit.event_category
879
            args['e_a'] = hit.event_action
880

    
881
            if hit.event_name:
882
                args['e_n'] = hit.event_name
883

    
884
        if hit.length:
885
            args['bw_bytes'] = hit.length
886

    
887
        # convert custom variable args to JSON
888
        if 'cvar' in args and not isinstance(args['cvar'], str):
889
            args['cvar'] = json.dumps(args['cvar'])
890

    
891
        if '_cvar' in args and not isinstance(args['_cvar'], str):
892
            args['_cvar'] = json.dumps(args['_cvar'])
893

    
894
        return UrlHelper.convert_array_args(args)
895

    
896
    def _get_host_with_protocol(self, host, main_url):
897
        if '://' not in host:
898
            parts = urlparse.urlparse(main_url)
899
            host = parts.scheme + '://' + host
900
        return host
901

    
902
    def _record_hits(self, hits):
903
        """
904
        Inserts several hits into Matomo.
905
        """
906

    
907
        #if not config.options.dry_run:
908
        data = {
909
            'token_auth': config.options['Matomo_Parameters']['token_auth'],
910
            'requests': [self._get_hit_args(hit) for hit in hits]
911
        }
912

    
913
        try:
914
            args = {}
915

    
916
            response = matomo.call(
917
                '/piwik.php', args=args,
918
                expected_content=None,
919
                headers={'Content-type': 'application/json'},
920
                data=data,
921
                on_failure=self._on_tracking_failure
922
            )
923
            # check for invalid requests
924
            try:
925
                response = json.loads(response)
926
            except:
927
                logging.info("bulk tracking returned invalid JSON")
928

    
929
                response = {}
930

    
931
            if ('invalid_indices' in response and isinstance(response['invalid_indices'], list) and
932
                response['invalid_indices']):
933
                invalid_count = len(response['invalid_indices'])
934

    
935
                invalid_lines = [str(hits[index].lineno) for index in response['invalid_indices']]
936
                invalid_lines_str = ", ".join(invalid_lines)
937

    
938
                #stats.invalid_lines.extend(invalid_lines)
939

    
940
                logging.info("The Matomo tracker identified %s invalid requests on lines: %s" % (invalid_count, invalid_lines_str))
941
            elif 'invalid' in response and response['invalid'] > 0:
942
                logging.info("The Matomo tracker identified %s invalid requests." % response['invalid'])
943
        except Matomo.Error as e:
944
            # if the server returned 400 code, BulkTracking may not be enabled
945
            if e.code == 400:
946
                fatal_error("Server returned status 400 (Bad Request).\nIs the BulkTracking plugin disabled?", hits[0].filename, hits[0].lineno)
947

    
948
            raise
949

    
950
        stats.count_lines_recorded.advance(len(hits))
951

    
952

    
953
    def _is_json(self, result):
954
        try:
955
            json.loads(result)
956
            return True
957
        except ValueError as e:
958
            return False
959

    
960
    def _on_tracking_failure(self, response, data):
961
        """
962
        Removes the successfully tracked hits from the request payload so
963
        they are not logged twice.
964
        """
965
        try:
966
            response = json.loads(response)
967
        except:
968
            # the response should be in JSON, but in case it can't be parsed just try another attempt
969
            logging.debug("cannot parse tracker response, should be valid JSON")
970
            return response
971

    
972
        # remove the successfully tracked hits from payload
973
        tracked = response['tracked']
974
        data['requests'] = data['requests'][tracked:]
975

    
976
        return response['message']
977

    
978
class Hit(object):
979
    """
980
    It's a simple container.
981
    """
982
    def __init__(self, **kwargs):
983
        for key, value in kwargs.items():
984
            setattr(self, key, value)
985
        super(Hit, self).__init__()
986

    
987

    
988
    def get_visitor_id_hash(self):
989
        visitor_id = self.ip
990
        '''
991
        if config.options.replay_tracking:
992
            for param_name_to_use in ['uid', 'cid', '_id', 'cip']:
993
                if param_name_to_use in self.args:
994
                    visitor_id = self.args[param_name_to_use]
995
                    break
996
        '''
997
        return abs(hash(visitor_id))
998

    
999
    def add_page_custom_var(self, key, value):
1000
        """
1001
        Adds a page custom variable to this Hit.
1002
        """
1003
        self._add_custom_var(key, value, 'cvar')
1004

    
1005
    def add_visit_custom_var(self, key, value):
1006
        """
1007
        Adds a visit custom variable to this Hit.
1008
        """
1009
        self._add_custom_var(key, value, '_cvar')
1010

    
1011
    def _add_custom_var(self, key, value, api_arg_name):
1012
        if api_arg_name not in self.args:
1013
            self.args[api_arg_name] = {}
1014

    
1015
        if isinstance(self.args[api_arg_name], str):
1016
            logging.debug("Ignoring custom %s variable addition [ %s = %s ], custom var already set to string." % (api_arg_name, key, value))
1017
            return
1018

    
1019
        index = len(self.args[api_arg_name]) + 1
1020
        self.args[api_arg_name][index] = [key, value]
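    # Illustrative sketch (not part of the original script; the identifier value is a
    # placeholder): after
    #   hit.add_page_custom_var('oaipmhID', 'oai:example.org:1234')
    # the hit carries args['cvar'] == {1: ['oaipmhID', 'oai:example.org:1234']}, which
    # _get_hit_args() later serialises to JSON before the request is sent to Matomo.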
1021

    
1022
class CheckRobots(object):
1023
    import os, ssl
1024
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
1025
        ssl._create_default_https_context = ssl._create_unverified_context
1026
    
1027
    def _readCOUNTERRobots(self):
1028
        url = config.options["Matomo_Parameters"]["COUNTER_Robots_url"]
1029
        response = urllib2.urlopen(url)
1030

    
1031
        self.counterRobotsList = json.loads(response.read())
1032
        return self.counterRobotsList
1033

    
1034
    def __init__(self):
1035
        self._readCOUNTERRobots()
1036

    
1037

    
1038

    
1039
class Parser(object):
1040
    """
1041
    The Parser parses the lines in a specified file and inserts them into
1042
    a Queue.
1043
    """
1044

    
1045
    def __init__(self):
1046
        self.check_methods = [method for name, method
1047
                              in inspect.getmembers(self, predicate=inspect.ismethod)
1048
                              if name.startswith('check_')]
1049

    
1050
    ## All check_* methods are called for each hit and must return True if the
1051
    ## hit can be imported, False otherwise.
1052

    
1053

    
1054
    def check_static(self, hit):
1055
        hit.path=urlparse.unquote(hit.path)
1056
        if config.options["Matomo_Parameters"]["tracking_metadata"] is not None:
1057
            for i in config.options["Matomo_Parameters"]["tracking_metadata"]:
1058
                    pattern = re.compile(i)
1059
                    if pattern.match(hit.path):
1060
                        patternOAI=re.compile(i)
1061
                        if patternOAI.match(hit.path):
1062
                            finalOAIpmh=config.options["Matomo_Parameters"]["oaipmh_preamble"]+patternOAI.match(hit.path).group(1)[patternOAI.match(hit.path).group(1).rfind("/")+1:]
1063
                            if finalOAIpmh!=config.options["Matomo_Parameters"]["oaipmh_preamble"]:
1064
                                hit.add_page_custom_var("oaipmhID",finalOAIpmh)
1065
                                hit.is_meta=True
1066
        return True
1067

    
1068
    def check_download(self, hit):
1069
        hit.path=urlparse.unquote(hit.path)
1070
        if config.options["Matomo_Parameters"]["tracking_download"] is not None:
1071
            for i in config.options["Matomo_Parameters"]["tracking_download"]:
1072
                pattern = re.compile(i)
1073
                if pattern.match(hit.path):
1074
                    patternOAI=re.compile(i)
1075
                    if patternOAI.match(hit.path):
1076
                        finalOAIpmh=config.options["Matomo_Parameters"]["oaipmh_preamble"]+patternOAI.match(hit.path).group(1)[patternOAI.match(hit.path).group(1).rfind("/")+1:]
1077
                        if finalOAIpmh!=config.options["Matomo_Parameters"]["oaipmh_preamble"]:
1078
                            hit.add_page_custom_var("oaipmhID",finalOAIpmh)
1079
                            hit.is_download = True
1080
        return True
1081

    
1082
    def check_user_agent(self, hit):
1083
        user_agent = hit.user_agent
1084
        for p in checkRobots.counterRobotsList:
1085
            pattern = re.compile(p['pattern'])
1086
            if pattern.search(user_agent):
1087
                stats.count_lines_skipped_user_agent.increment()
1088
                hit.is_robot = True
1089
                break
1090
        return True
1091

    
1092
    def check_http_error(self, hit):
1093
        if hit.status[0] in ('4', '5'):
1094
            hit.is_error = True
1095
            return True
1096
        return True
1097

    
1098
    def check_http_redirect(self, hit):
1099
        if hit.status[0] == '3' and hit.status != '304':
1100
             hit.is_redirect = True
1101
             return True
1102
        return True
1103
    @staticmethod
1104
    def check_format(lineOrFile):
1105
        format = False
1106
        format_groups = 0
1107
        if config.options['Matomo_Parameters']['LogFileFormat']:
1108
            _INPUT_FORMAT = (config.options['Matomo_Parameters']['LogFileFormat'])
1109
            FORMATS['custom_input_format']=RegexFormat('custom_input_format', _INPUT_FORMAT)
1110
        for name, candidate_format in FORMATS.items():
1111
            logging.debug("Check format %s", name)
1112

    
1113
            # skip auto detection for formats that can't be detected automatically
1114
            if name == 'ovh':
1115
                continue
1116

    
1117
            match = None
1118
            try:
1119
                if isinstance(lineOrFile, str):
1120
                    match = candidate_format.check_format_line(lineOrFile)
1121
                else:
1122
                    match = candidate_format.check_format(lineOrFile)
1123
            except Exception as e:
1124
                logging.debug('Error in format checking: %s', traceback.format_exc())
1125
                pass
1126

    
1127
            if match:
1128
                logging.debug('Format %s matches', name)
1129

    
1130
                # compare format groups if this *BaseFormat has groups() method
1131
                try:
1132
                    # if there's more info in this match, use this format
1133
                    match_groups = len(match.groups())
1134

    
1135
                    logging.debug('Format match contains %d groups' % match_groups)
1136

    
1137
                    if format_groups < match_groups:
1138
                        format = candidate_format
1139
                        format_groups = match_groups
1140
                except AttributeError:
1141
                    format = candidate_format
1142

    
1143
            else:
1144
                logging.debug('Format %s does not match', name)
1145

    
1146
        # if the format is W3cExtendedFormat, check if the logs are from IIS and if so, issue a warning if the
1147
        # --w3c-time-taken-milli option isn't set
1148
        if isinstance(format, W3cExtendedFormat):
1149
            format.check_for_iis_option()
1150
        # dpie check
1151
        #print "Format name "+format.name
1152
        return format
1153

    
1154
    @staticmethod
1155
    def detect_format(file):
1156
        """
1157
        Return the best matching format for this file, or None if none was found.
1158
        """
1159
        logging.debug('Detecting the log format')
1160

    
1161
        format = False
1162

    
1163
        # check the format using the file (for formats like the W3cExtendedFormat one)
1164
        format = Parser.check_format(file)
1165
        # check the format using the first N lines (to avoid irregular ones)
1166
        lineno = 0
1167
        limit = 100000
1168
        while not format and lineno < limit:
1169
            line = file.readline()
1170
            if not line: # if at eof, don't keep looping
1171
                break
1172

    
1173
            lineno = lineno + 1
1174

    
1175
            logging.debug("Detecting format against line %i" % lineno)
1176
            format = Parser.check_format(line)
1177

    
1178
        try:
1179
            file.seek(0)
1180
        except IOError:
1181
            pass
1182

    
1183
        if not format:
1184
            fatal_error("Cannot automatically determine the log format using the first %d lines of the log file. " % limit +
1185
                        "\nMaybe try specifying the format in LogFileFormat variable of yaml file." )
1186
            return
1187

    
1188
        logging.debug('Format %s is the best match', format.name)
1189
        return format
1190

    
1191
    def is_filtered(self, hit):
1192
        host = None
1193
        if hasattr(hit, 'host'):
1194
            host = hit.host
1195
        else:
1196
            try:
1197
                host = urlparse.urlparse(hit.path).hostname
1198
            except:
1199
                pass
1200
        return (False, None)
1201

    
1202
    def parse(self, filename):
1203
        """
1204
        Parse the specified filename and insert hits in the queue.
1205
        """
1206
        def invalid_line(line, reason):
1207
            logging.debug('Invalid line detected (%s): %s' % (reason, line))
1208

    
1209
        def filtered_line(line, reason):
1210
            logging.debug('Filtered line out (%s): %s' % (reason, line))
1211

    
1212
        if filename == '-':
1213
            filename = '(stdin)'
1214
            file = sys.stdin
1215
        else:
1216
            if not os.path.exists(filename):
1217
                #print >> sys.stderr, "\n=====> Warning: File %s does not exist <=====" % filename
1218
                print("\n=====> Warning: File %s does not exist <=====" % filename, file=sys.stderr)
1219
                return
1220
            else:
1221
                if filename.endswith('.bz2'):
1222
                    open_func = bz2.open
1223
                elif filename.endswith('.gz'):
1224
                    open_func = gzip.open
1225
                else:
1226
                    open_func = open
1227
                file = open_func(filename, 'rt')  # text mode, so the regexes below receive str rather than bytes
1228

    
1229
        format = self.detect_format(file)
1230
        if format is None:
1231
            return fatal_error(
1232
                'Cannot guess the logs format. Please specify one using '
1233
                'the LogFileFormat variable of the yaml configuration file'
1234
            )
1235
        # Make sure the format is compatible with the resolver.
1236
        #resolver.check_format(format)
1237
        valid_lines_count = 0
1238
        hits = []
1239
        lineno = -1
1240
        while True:
1241
            line = file.readline()
1242

    
1243
            if not line: break
1244
            lineno = lineno + 1
1245

    
1246
            stats.count_lines_parsed.increment()
1247
            skiplines=0
1248
            opts, args = getopt.getopt(sys.argv[1:],"s:",["skip="])
1249
            for opt, arg in opts:
1250
                if  opt in ("-s", "--skip"):
1251
                    skiplines = arg
1252
            if stats.count_lines_parsed.value <= int(skiplines):
1253
                continue
1254

    
1255
            match = format.match(line)
1256
            if not match:
1257
                invalid_line(line, 'line did not match')
1258
                continue
1259

    
1260
            valid_lines_count = valid_lines_count + 1
1261

    
1262
            hit = Hit(
1263
                filename=filename,
1264
                lineno=lineno,
1265
                status=format.get('status'),
1266
                full_path=format.get('path'),
1267
                is_meta=False,
1268
                is_download=False,
1269
                is_robot=False,
1270
                is_error=False,
1271
                is_redirect=False,
1272
                args={},
1273
            )
1274
            '''
1275
            todelete
1276
            # Add http method page cvar
1277
            try:
1278
                httpmethod = format.get('method')
1279
                if config.options.track_http_method and httpmethod != '-':
1280
                    hit.add_page_custom_var('HTTP-method', httpmethod)
1281
            except:
1282
                pass
1283
            '''
1284
            # W3cExtendedFormat defaults to - when there is no query string, but we want an empty string
1285
            hit.query_string = ''
1286
            hit.path = hit.full_path
1287

    
1288
            try:
1289
                hit.referrer = format.get('referrer')
1290

    
1291
                if hit.referrer.startswith('"'):
1292
                    hit.referrer = hit.referrer[1:-1]
1293
            except BaseFormatException:
1294
                hit.referrer = ''
1295
            if hit.referrer == '-':
1296
                hit.referrer = ''
1297

    
1298
            try:
1299
                hit.user_agent = format.get('user_agent')
1300

    
1301
                # in case a format parser included enclosing quotes, remove them so they are not
1302
                # sent to Matomo
1303
                if hit.user_agent.startswith('"'):
1304
                    hit.user_agent = hit.user_agent[1:-1]
1305
            except BaseFormatException:
1306
                hit.user_agent = ''
1307

    
1308
            hit.ip = format.get('ip')
1309

    
1310
            #IP anonymization
1311
            if config.options['Matomo_Parameters']['ip_anonymization'] is True:
1312
                hit.ip = hit.ip.split('.')[0]+"."+hit.ip.split('.')[1]+".0.0"
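                # e.g. an IPv4 address '130.192.45.67' becomes '130.192.0.0' (the last
                # two octets are masked); this assumes a dotted IPv4 address.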
1313

    
1314
            try:
1315
                hit.length = int(format.get('length'))
1316
            except (ValueError, BaseFormatException):
1317
                # Some lines or formats don't have a length (e.g. 304 redirects, W3C logs)
1318
                hit.length = 0
1319

    
1320
            try:
1321
                hit.generation_time_milli = float(format.get('generation_time_milli'))
1322
            except (ValueError, BaseFormatException):
1323
                try:
1324
                    hit.generation_time_milli = float(format.get('generation_time_micro')) / 1000
1325
                except (ValueError, BaseFormatException):
1326
                    try:
1327
                        hit.generation_time_milli = float(format.get('generation_time_secs')) * 1000
1328
                    except (ValueError, BaseFormatException):
1329
                        hit.generation_time_milli = 0
1330

    
1331
            try:
1332
                hit.host = format.get('host').lower().strip('.')
1333
                if hit.host.startswith('"'):
1334
                    hit.host = hit.host[1:-1]
1335
            except BaseFormatException:
1336
                # Some formats have no host.
1337
                pass
1338

    
1339
            # Add userid
1340
            try:
1341
                hit.userid = None
1342
                userid = format.get('userid')
1343
                if userid != '-':
1344
                    hit.args['uid'] = hit.userid = userid
1345
            except:
1346
                pass
1347

    
1348
            # add event info
1349
            try:
1350
                hit.event_category = hit.event_action = hit.event_name = None
1351

    
1352
                hit.event_category = format.get('event_category')
1353
                hit.event_action = format.get('event_action')
1354

    
1355
                hit.event_name = format.get('event_name')
1356
                if hit.event_name == '-':
1357
                    hit.event_name = None
1358
            except:
1359
                pass
1360

    
1361
            # Check if the hit must be excluded.
1362
            if not all((method(hit) for method in self.check_methods)):
1363
                continue
1364

    
1365
            # Parse date.
1366
            # We parse it after calling check_methods as it's quite CPU hungry, and
1367
            # we want to avoid that cost for excluded hits.
1368
            date_string = format.get('date')
1369

    
1370
            try:
1371
                hit.date = datetime.datetime.strptime(date_string, format.date_format)
1372
            except ValueError as e:
1373
                invalid_line(line, 'invalid date or invalid format: %s' % str(e))
1374
                continue
1375

    
1376
            # Parse timezone and subtract its value from the date
1377
            try:
1378
                timezone = float(format.get('timezone'))
1379
            except BaseFormatException:
1380
                timezone = 0
1381
            except ValueError:
1382
                invalid_line(line, 'invalid timezone')
1383
                continue
1384

    
1385
            if timezone:
1386
                hit.date -= datetime.timedelta(hours=timezone/100)
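                # e.g. a '+0200' offset is parsed as 200.0 above, so two hours are
                # subtracted to normalise the timestamp to UTC.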
1387

    
1388
            (is_filtered, reason) = self.is_filtered(hit)
1389
            if is_filtered:
1390
                filtered_line(line, reason)
1391
                continue
1392
            if (not hit.is_robot) and (hit.is_meta or hit.is_download) and (not hit.is_redirect):
1393
                hits.append(hit)
1394
            if (not hit.is_robot and not hit.is_redirect and hit.is_meta):
1395
                stats.count_lines_static.increment()
1396
            if (not hit.is_robot and not hit.is_redirect and hit.is_download):
1397
                stats.count_lines_downloads.increment()
1398

    
1399
            #else:
1400
            # f.write("not pass "+ hit.full_path +" "+hit.user_agent+'\n')
1401
            if len(hits) >= config.options['Matomo_Parameters']['max_payload'] * len(Recorder.recorders):
1402
                Recorder.add_hits(hits)
1403
                hits = []
1404
        # add last chunk of hits
1405
        if len(hits) > 0:
1406
            Recorder.add_hits(hits)
1407

    
1408

    
1409
class Statistics(object):
    """
    Store statistics about parsed logs and recorded entries.
    Can optionally print statistics on standard output every second.
    """

    class Counter(object):
        """
        Simple integers cannot be used by multithreaded programs. See:
        http://stackoverflow.com/questions/6320107/are-python-ints-thread-safe
        """
        def __init__(self):
            # itertools.count's implementation in C does not release the GIL and
            # therefore is thread-safe.
            self.counter = itertools.count(1)
            self.value = 0
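            # 'value' holds the most recently issued count; it is read only for progress
            # reporting and the final summary, so an exact snapshot is not required.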

        def increment(self):
            self.value = self.counter.__next__()

        def advance(self, n):
            for i in range(n):
                self.increment()

        def __str__(self):
            return str(int(self.value))

    def __init__(self):
        self.time_start = None
        self.time_stop = None

        self.count_lines_parsed = self.Counter()
        self.count_lines_recorded = self.Counter()

        # requests that the Matomo tracker considered invalid (or failed to track)
        self.invalid_lines = []

        # Do not match the regexp.
        self.count_lines_invalid = self.Counter()
        # Were filtered out.
        self.count_lines_filtered = self.Counter()
        # Static files.
        self.count_lines_static = self.Counter()
        # Ignored user-agents.
        self.count_lines_skipped_user_agent = self.Counter()
        # Downloads
        self.count_lines_downloads = self.Counter()

        # Misc
        self.dates_recorded = set()
        self.monitor_stop = False

    def set_time_start(self):
        self.time_start = time.time()

    def set_time_stop(self):
        self.time_stop = time.time()

    def _compute_speed(self, value, start, end):
        delta_time = end - start
        if value == 0:
            return 0
        if delta_time == 0:
            return 'very high!'
        else:
            return value / delta_time
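
    # Example: 1200 recorded requests over a 60 second run give 20.0 requests per second;
    # _round_value() below keeps two decimal places (base=100) for the summary output.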
    def _round_value(self, value, base=100):
        return round(value * base) / base

    def _indent_text(self, lines, level=1):
        """
        Return an indented text. 'lines' can be a list of lines or a single
        line (as a string). One level of indentation is 4 spaces.
        """
        prefix = ' ' * (4 * level)
        if isinstance(lines, str):
            return prefix + lines
        else:
            return '\n'.join(
                prefix + line
                for line in lines
            )

    def print_summary(self):
        invalid_lines_summary = ''
        if self.invalid_lines:
            invalid_lines_summary = '''Invalid log lines
-----------------

The following lines were not tracked by Matomo, either due to a malformed tracker request or an error in the tracker:

%s

''' % textwrap.fill(", ".join(self.invalid_lines), 80)

        print('''
%(invalid_lines)sLogs import summary
-------------------

    %(count_lines_recorded)d requests imported successfully
    %(count_lines_downloads)d requests were downloads
    %(count_lines_metadata)d requests were metadata
    %(count_lines_skipped_user_agent)d requests ignored (hits from bots, search engines, ...)

Performance summary
-------------------

    Total time: %(total_time)d seconds
    Requests imported per second: %(speed_recording)s requests per second

''' % {
    'count_lines_recorded': self.count_lines_recorded.value,
    'count_lines_downloads': self.count_lines_downloads.value,
    'count_lines_metadata': self.count_lines_static.value,
    'count_lines_skipped_user_agent': self.count_lines_skipped_user_agent.value,
    'total_time': self.time_stop - self.time_start,
    'speed_recording': self._round_value(self._compute_speed(
            self.count_lines_recorded.value,
            self.time_start, self.time_stop,
        )),
    'invalid_lines': invalid_lines_summary
})

    ##
    ## The monitor is a thread that prints a short summary each second.
    ##

    def _monitor(self):
        latest_total_recorded = 0
        while not self.monitor_stop:
            current_total = stats.count_lines_recorded.value
            time_elapsed = time.time() - self.time_start

            # Build the status line once so the console and the log report the same figures.
            status = '%d lines parsed, %d lines recorded, %d records/sec (avg), %d records/sec (current)' % (
                stats.count_lines_parsed.value,
                current_total,
                current_total / time_elapsed if time_elapsed != 0 else 0,
                current_total - latest_total_recorded,
            )
            print(status)
            logging.info(status)
            latest_total_recorded = current_total
            time.sleep(1)

    def start_monitor(self):
        t = threading.Thread(target=self._monitor)
        t.daemon = True
        t.start()
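        # The monitor runs as a daemon thread, so it cannot keep the process alive on
        # its own; stop_monitor() simply flips the flag checked by the _monitor() loop.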

    def stop_monitor(self):
        self.monitor_stop = True


def main():
    """
    Start the importing process.
    """
    stats.set_time_start()
    '''
    if config.options.show_progress:
        stats.start_monitor()
    '''
    stats.start_monitor()
    #recorders = Recorder.launch(config.options.recorders)
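    # The recorder count comes from the 'Matomo_Parameters' section of the configuration;
    # Recorder.launch() starts the recorders that send the batched hits to Matomo.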
    recorders = Recorder.launch(config.options["Matomo_Parameters"]["recorders"])

    try:
        for filename in config.filenames:
            logging.info("Reading... " + filename)
            parser.parse(filename)

        Recorder.wait_empty()
    except KeyboardInterrupt:
        pass

    stats.set_time_stop()
    '''
    if config.options.show_progress:
        stats.stop_monitor()
    '''
    stats.stop_monitor()
    stats.print_summary()

def fatal_error(error, filename=None, lineno=None):
    sys.stderr.write('Fatal error: %s\n' % error)
    if filename and lineno is not None:
        sys.stderr.write('You can restart the import of "%s" from the point it failed by specifying --skip=%d on the command line.\n' % (filename, lineno))
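    # os._exit() ends the process immediately, skipping normal interpreter cleanup
    # (atexit handlers, buffer flushing); that is deliberate for a fatal error.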
    os._exit(1)
if __name__ == '__main__':
    try:
        config = Configuration()
        checkRobots = CheckRobots()
        matomo = Matomo()
        stats = Statistics()
        parser = Parser()
        main()
        sys.exit(0)
    except KeyboardInterrupt:
        pass