|
|
- #!/bin/env python
-
- # Simple Apache HTTPD log file parser
- # Copyright (C) 2022 Pekka Helenius <pekka [dot] helenius [at] fjordtek [dot] com>
- #
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <https://www.gnu.org/licenses/>.
-
- ################################################################
-
- # TODO: prev_host: instead of comparing to previous entry, check if such IP has been seen in XXX seconds
- # TODO: store IP values for temporary list for XXX seconds, and check list values
- # TODO: implement warning check for geoiplookup tool database files, i.e. "warning, some geo database files are very old. Please consider updating geo database information."
-
- import argparse
- import os
- import re
- import subprocess
-
- from datetime import datetime
- from apachelogs import LogParser, InvalidEntryError
-
class text_processing(object):
    """Minimal console reporter (used instead of the logging library)."""

    def __init__(self, verbose):
        """Store the flag that decides whether verbose messages are shown."""
        self.show_verbose = verbose

    def print_verbose(self, prefix='output', *args):
        """
        Print one 'VERBOSE [prefix]: a, b, c' line when verbose mode is on.

        Every positional argument after the prefix is converted with str()
        and the results are joined with ', '.
        """
        if not self.show_verbose:
            return
        joined = ', '.join(map(str, args))
        print('VERBOSE [{:s}]: {:s}'.format(prefix, joined))
-
- class program(object):
-
- """
- Init
- """
def __init__(self):
    """
    Parse command-line arguments and prepare shared helpers.

    Arguments are parsed first, so 'self.txt' does not exist yet while
    'get_args' (and anything it calls) is running.
    """
    self.args = self.get_args()

    # Hosts matching these patterns are excluded from the geo lookup process,
    # and their log lines are parsed with %I and %O stripped out of the
    # Apache log format.
    # 127.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
    # NOTE(review): 10.0.0.0/8 is not covered here - confirm whether that is intended.
    # Raw strings keep the regex backslashes without relying on invalid
    # string escape sequences.
    self.private_class_ip_networks = [
        r'^127\.',
        r'^172\.(1[6-9]{1}|2[0-9]{1}|3[0-1]{1})\.',
        r'^192\.168\.'
    ]

    self.txt = text_processing(verbose = self.args.verbose)
-
- """
- Define & get output fields
- """
def get_out_fields(self):
    """
    Build a fresh definition table of every output field.

    Returns a dict keyed by field name. Each value holds the current
    'data' (initially None), the 'format' spec used for table output,
    whether the field is 'included' by default, its 'human_name' column
    header and its 'sort_index' (position in the default column order).
    """
    # (field name, format spec, included by default, column header)
    layout = (
        ('log_file_name', '{:s}',   False, 'Log file name'),
        ('http_status',   '{:3s}',  True,  'Status'),
        ('remote_host',   '{:15s}', True,  'Remote IP'),
        ('country',       '{:20s}', False, 'Country'),
        ('city',          '{:15s}', False, 'City'),
        ('time',          '{:20s}', True,  'Date/Time'),
        ('time_diff',     '{:8s}',  True,  'Time diff'),
        ('user_agent',    '{:s}',   True,  'User agent'),
        ('http_request',  '{:s}',   True,  'Request'),
    )

    return {
        name: {
            'data': None,
            'format': spec,
            'included': shown,
            'human_name': title,
            'sort_index': position
        }
        for position, (name, spec, shown, title) in enumerate(layout)
    }
-
- """
- Get default Apache HTTPD configuration file location
- """
def get_apache_conf_path(self):
    """
    Guess the default Apache HTTPD configuration file for this system.

    Reads the NAME="..." line of /etc/os-release and returns the path
    registered for the first distribution name found on that line.
    Returns None when the file is not readable or nothing matches.
    """
    os_data_file = '/etc/os-release'
    conf_path = [
        { 'os_check_file': os_data_file, 'os_like': 'Arch Linux', 'path': '/etc/httpd/conf/httpd.conf'},
        { 'os_check_file': os_data_file, 'os_like': 'Debian',     'path': '/etc/apache2/apache2.conf'},
        { 'os_check_file': os_data_file, 'os_like': 'Ubuntu',     'path': '/etc/apache2/apache2.conf'},
        { 'os_check_file': os_data_file, 'os_like': 'Linux Mint', 'path': '/etc/apache2/apache2.conf'},
        { 'os_check_file': os_data_file, 'os_like': 'openSUSE',   'path': '/etc/apache2/httpd.conf'},
        { 'os_check_file': os_data_file, 'os_like': 'Gentoo',     'path': '/etc/apache2/httpd.conf'},
        { 'os_check_file': os_data_file, 'os_like': 'Red Hat',    'path': '/etc/httpd/conf/httpd.conf'},
        { 'os_check_file': os_data_file, 'os_like': 'Fedora',     'path': '/etc/httpd/conf/httpd.conf'}
    ]

    if not self.check_file(os_data_file, "os.R_OK"):
        return None

    with open(os_data_file, 'r') as release_info:
        for row in release_info:
            # Only the distribution name line is of interest
            if not re.match('^[ ]?NAME=\"', row):
                continue
            for candidate in conf_path:
                if re.match('.*' + candidate['os_like'] + '.*', row):
                    return candidate['path']

    return None
-
- """
- Argument parser
- """
def get_args(self, argv = None):
    """
    Define and parse command-line arguments.

    Parameters:
        argv: optional list of argument strings. When None (default),
              argparse reads sys.argv as before.

    Returns the argparse namespace. Comma-separated options (files list,
    countries, included/excluded fields) are returned as lists.
    """

    all_fields = self.get_out_fields()
    incl_fields = [i for i in all_fields.keys() if all_fields[i]['included']]

    def comma_list(value):
        # 'a,b,c' -> ['a', 'b', 'c']
        return value.split(',')

    argparser = argparse.ArgumentParser(
        description = 'Apache HTTPD server log parser',
        formatter_class = argparse.ArgumentDefaultsHelpFormatter
    )

    argparser.add_argument(
        '-fr', '--files-regex',
        help = 'Apache log files matching input regular expression.',
        nargs = '?',
        dest = 'files_regex',
        required = False
    )
    argparser.add_argument(
        '-f', '--files-list',
        help = 'Apache log files.\nRegular expressions supported.',
        nargs = '?',
        type = comma_list,
        dest = 'files_list',
        required = False
    )
    argparser.add_argument(
        '-c', '--status-codes',
        help = 'Print only these numerical status codes.\nRegular expressions supported.',
        nargs = '+',
        dest = 'codes',
        required = False
    )
    argparser.add_argument(
        '-cf', '--countries',
        # Backslash is escaped explicitly; the rendered help text is unchanged
        help = 'Include only these countries.\nNegative match (exclude): "\\!Country"',
        nargs = '?',
        type = comma_list,
        dest = 'countries',
        required = False
    )
    argparser.add_argument(
        '-tf', '--time-format',
        help = 'Output time format.',
        nargs = '?',
        dest = 'time_format',
    )
    argparser.add_argument(
        '-if', '--included-fields',
        help = 'Included fields.\nAll fields: all, ' + ', '.join(all_fields),
        nargs = '?',
        dest = 'incl_fields',
        type = comma_list,
        # String defaults are passed through 'type', so this becomes a list too
        default = ','.join(incl_fields)
    )
    argparser.add_argument(
        '-ef', '--excluded-fields',
        help = 'Excluded fields.',
        nargs = '?',
        dest = 'excl_fields',
        type = comma_list,
        default = None
    )
    argparser.add_argument(
        '-gl', '--geo-location',
        help = 'Check origin countries with external "geoiplookup" tool.\nNOTE: Automatically includes "country" and "city" fields.',
        action = 'store_true',
        dest = 'use_geolocation'
    )
    argparser.add_argument(
        '-ge', '--geotool-exec',
        help = '"geoiplookup" tool executable found in PATH.',
        nargs = '?',
        dest = 'geotool_exec',
        default = 'geoiplookup'
    )
    argparser.add_argument(
        '-gd', '--geo-database-dir',
        help = 'Database file directory for "geoiplookup" tool.',
        nargs = '?',
        dest = 'geo_database_location',
        default = '/usr/share/GeoIP/'
    )
    argparser.add_argument(
        '-dl', '--day-lower',
        help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020',
        nargs = '?',
        dest = 'date_lower'
    )
    argparser.add_argument(
        '-du', '--day-upper',
        help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020',
        nargs = '?',
        dest = 'date_upper'
    )
    argparser.add_argument(
        '-sb', '--sort-by',
        help = 'Sort by an output field.',
        nargs = '?',
        dest = 'sortby_field'
    )
    argparser.add_argument(
        '-ro', '--reverse',
        help = 'Sort in reverse order.',
        dest = 'sortby_reverse',
        action = 'store_true'
    )
    argparser.add_argument(
        '-st', '--show-stats',
        help = 'Show short statistics at the end.',
        action = 'store_true',
        dest = 'show_stats'
    )
    argparser.add_argument(
        '-p', '--show-progress',
        help = 'Show progress information.',
        dest = 'show_progress',
        action = 'store_true'
    )
    argparser.add_argument(
        '--httpd-conf-file',
        help = 'Apache HTTPD configuration file with LogFormat directive.',
        dest = 'httpd_conf_file',
        default = self.get_apache_conf_path(),
        nargs = '?',
        type = str
    )
    argparser.add_argument(
        '--httpd-log-nickname',
        help = 'LogFormat directive nickname',
        # The nickname is a string value, not a boolean switch. A bare flag
        # (no value) still works and falls back to the default nickname.
        nargs = '?',
        type = str,
        const = 'combinedio',
        dest = 'httpd_log_nickname',
        default = 'combinedio'
    )
    argparser.add_argument(
        '-lf', '--log-format',
        help = 'Log format, manually defined.',
        dest = 'log_format',
        required = False
    )
    argparser.add_argument(
        '-ph', '--print-header',
        help = 'Print column headers.',
        dest = 'column_headers',
        required = False,
        action = 'store_true'
    )
    argparser.add_argument(
        '--output-format',
        help = 'Output format for results.',
        dest = 'output_format',
        required = False,
        default = 'table',
        choices = ['table', 'csv']
    )
    argparser.add_argument(
        '--head',
        help = 'Read first N lines from all log entries.',
        dest = 'read_first_lines_num',
        required = False,
        nargs = '?',
        type = int
    )
    argparser.add_argument(
        '--tail',
        help = 'Read last N lines from all log entries.',
        dest = 'read_last_lines_num',
        required = False,
        nargs = '?',
        type = int
    )
    argparser.add_argument(
        '--sort-logs-by',
        help = 'Sorting order for input log files.',
        dest = 'sort_logs_by_info',
        required = False,
        default = 'name',
        choices = ['date', 'size', 'name']
    )
    argparser.add_argument(
        '--verbose',
        help = 'Verbose output.',
        dest = 'verbose',
        required = False,
        action = 'store_true'
    )

    return argparser.parse_args(argv)
-
- """
- Populate recognized HTTP status codes
- """
def populate_status_codes(self):
    """
    Populate recognized HTTP status codes.

    Returns a list of status code strings. Entries written as
    'first-last' are expanded to every code of that range, both ends
    included.
    """

    http_valid_codes = [
        '100-103',
        '200-208',
        '218',
        '226',
        '300-308',
        '400-431',
        '451',
        '500-511'
    ]

    codes = []
    for code in http_valid_codes:
        first, is_range, last = code.partition('-')
        if is_range:
            # range() excludes its stop value, hence the + 1
            codes.extend(str(i) for i in range(int(first), int(last) + 1))
        else:
            codes.append(code)

    return codes
-
- """
- Get valid HTTP status codes from user input
- """
def get_input_status_codes(self, valid_codes, user_codes):
    """
    Resolve user supplied status code patterns against known codes.

    Each user code is used as a regular expression and searched in every
    valid code. Returns a list of (code, validated) tuples: one
    (valid_code, True) per matching valid code, or a single
    (user_code, False) when the pattern matches nothing.
    """
    codes = []

    for requested in user_codes:
        pattern = str(requested)
        hits = [(known, True) for known in valid_codes if re.search(pattern, known)]
        if hits:
            codes.extend(hits)
        else:
            codes.append((pattern, False))

    self.txt.print_verbose('Available status codes', codes)

    return codes
-
- """
- Get log file list
- """
def get_files(self, files_regex = None, files_list = None):
    """
    Get log file list.

    Parameters:
        files_regex: path whose last component is a regular expression
                     matched (re.match) against file names in the
                     directory part. Without a directory part the current
                     directory is searched.
        files_list:  explicit list of file paths; entries that are not
                     regular files are ignored.

    Returns the sorted list of matching file paths.
    Raises Exception when no selection method or both methods are given,
    or when no file matches.
    """

    if files_regex is None and files_list is None:
        raise Exception("Either single file or regex file selection method is required.")

    if files_regex and files_list:
        raise Exception("Single file and regex file selection methods are mutually exclusive.")

    files = []

    if files_regex:
        log_dir, file_part = os.path.split(files_regex)
        # A bare pattern such as 'access.*' has no directory part
        log_dir = log_dir or os.curdir
        for lfile in os.listdir(log_dir):
            candidate = os.path.join(log_dir, lfile)
            if os.path.isfile(candidate) and re.match(file_part, lfile):
                files.append(candidate)

    if files_list:
        files.extend(lfile for lfile in files_list if os.path.isfile(lfile))

    if not files:
        raise Exception("No matching files found.")

    files.sort()

    self.txt.print_verbose('Input files', files)
    return files
-
- """
- Common file checker
- """
def check_file(self, sfile, flag, env = None):
    """
    Common file checker.

    Parameters:
        sfile: file path, or a file name searched in the directories of
               'env' when that is given.
        flag:  access mode, either as an os.access() integer or as a
               string naming os constants ("os.R_OK", "X_OK",
               "os.R_OK | os.W_OK").
        env:   optional name of an environment variable holding an
               os.pathsep separated directory list (e.g. "PATH").

    Returns True when the file is accessible with the requested mode.
    Raises ValueError for an unknown flag name.
    """

    # Flag names are looked up in a fixed table instead of being eval()'d
    known_flags = {'F_OK': os.F_OK, 'R_OK': os.R_OK, 'W_OK': os.W_OK, 'X_OK': os.X_OK}

    if isinstance(flag, int):
        mode = flag
    else:
        mode = 0
        for part in str(flag).split('|'):
            flag_name = part.strip().rpartition('.')[2]
            if flag_name not in known_flags:
                raise ValueError("Unsupported file access flag: {}".format(flag))
            mode |= known_flags[flag_name]

    file_path = sfile

    if env is not None:
        search_path = os.environ.get(env)
        if search_path is None:
            return False
        for path in search_path.split(os.pathsep):
            file_path = os.path.join(path, sfile)
            if os.path.isfile(file_path):
                break
        else:
            # Not present in any of the listed directories
            return False

    if not os.access(file_path, mode):
        return False

    # The verbose printer may not exist yet when this is called during
    # argument parsing.
    reporter = getattr(self, 'txt', None)
    if reporter is not None:
        reporter.print_verbose('File check', file_path, 'flags: ' + str(flag))
    return True
-
- """
- Get Apache HTTPD LogFormat directive syntax
- """
def get_httpd_logformat_directive(self, cfile, tag = None):
    """
    Get Apache HTTPD LogFormat directive syntax.

    Parameters:
        cfile: Apache HTTPD configuration file path.
        tag:   LogFormat nickname to look for. The nickname must match
               exactly. When None or empty, the first LogFormat
               directive found is used.

    Returns the format string of the first matching directive with all
    backslashes removed, or None when no directive matches.
    Raises Exception when the configuration file is not readable.
    """

    if not self.check_file(cfile, "os.R_OK"):
        raise Exception("Couldn't open Apache HTTPD configuration file '{:s}'.".format(cfile))

    # The directive may start at column 0 or be indented (e.g. inside <IfModule>)
    if tag:
        directive = re.compile(r'^\s*LogFormat\s+"(.*)"\s+' + re.escape(tag) + r'\s*$')
    else:
        directive = re.compile(r'^\s*LogFormat\s+"(.*)"(?:\s+\S+)?\s*$')

    log_format = None
    self.txt.print_verbose('Apache configuration file', cfile)
    with open(cfile, 'r') as f:
        for line in f:
            found = directive.match(line)
            if found:
                # Drop the escaping backslashes of the quoted directive value
                log_format = found.group(1).replace('\\', '')
                break

    self.txt.print_verbose('Log format', log_format)
    return log_format
-
- """
- Geotool processing
- """
def geotool_get_data(self, geotool_ok, geotool_exec, database_file, remote_host):
    """
    Geotool processing.

    Parameters:
        geotool_ok:    whether the external tool and database were found.
        geotool_exec:  executable of the "geoiplookup" tool.
        database_file: value passed to the tool with '-d'.
        remote_host:   host address to look up.

    Returns a dict with 'host_country' and 'host_city'. Hosts matching
    the private network patterns get "Local" for both values. Returns
    None when the host is not private and the tool is not usable.
    """

    if re.match('|'.join(self.private_class_ip_networks), remote_host):
        return {
            'host_country': "Local",
            'host_city': "Local"
        }

    if not geotool_ok:
        return None

    try:
        raw_output = subprocess.check_output([geotool_exec, '-d', database_file, remote_host])
    except (OSError, subprocess.CalledProcessError):
        # A failed lookup of one host should not abort the whole run
        return {
            'host_country': "Unknown",
            'host_city': None
        }

    # Tool output is not guaranteed to be valid UTF-8
    report = raw_output.rstrip().decode(errors = 'replace').split('\n')

    # First output line: '<edition>: <code>, <country name>'
    country_parts = report[0].split(', ')
    if len(country_parts) > 1:
        host_country = country_parts[1]
    else:
        # Covers "Address not found" and any other unexpected output
        host_country = "Unknown"

    host_city = None
    if len(report) > 1:
        # Second output line holds comma separated city data
        city_parts = report[1].split(', ')
        if len(city_parts) > 4:
            host_city = city_parts[4]
            if re.search("N/A", host_city) and len(city_parts) > 7:
                host_city = "Unknown: " + city_parts[6] + ', ' + city_parts[7]

    return {
        'host_country': host_country,
        'host_city': host_city
    }
-
- """
- Status code filter
- """
def filter_status_code(self, status_codes, final_status):
    """
    Status code filter.

    'status_codes' holds (code, is_valid) pairs; entries of any other
    length are ignored. Returns False (keep the line) as soon as a valid
    pair equals 'final_status' numerically, True (skip the line)
    otherwise.
    """

    def accepts(candidate):
        # Anything that is not a (number, validity) pair is ignored
        if len(candidate) != 2:
            return False
        number, is_valid = candidate
        return bool(is_valid) and int(number) == final_status

    return not any(accepts(candidate) for candidate in status_codes)
-
- """
- Country name filter
- """
def filter_country(self, countries, host_country):
    """
    Country name filter.

    Parameters:
        countries:    country names to include. A name prefixed with
                      '!' or '\\!' is a negative match (exclude).
        host_country: country name of the current host, or None.

    Comparison is case-insensitive. Returns True when the line must be
    skipped, False when it is kept. Evaluation stops at the first exact
    include match or exclude match.
    """

    host = (host_country or '').lower()
    skip_line = True

    for country in countries:
        name = country.strip()

        # Both '!Country' and the shell-escaped '\!Country' mean exclusion
        negated = False
        for marker in ('\\!', '!'):
            if name.startswith(marker):
                name = name[len(marker):]
                negated = True
                break

        if negated:
            if name.lower() == host:
                skip_line = True
                break
            skip_line = False
        elif name.lower() == host:
            skip_line = False
            break

    return skip_line
-
- """
- Get lines to be processed from input files and min/max input
- min and max work much like Unix tools 'head' and 'tail'
- Only a single value (min or max) is allowed
- """
-
def get_file_lines_head_tail(self, sfiles, line_range_min = None, line_range_max = None, files_order = None):
    """
    Get lines to be processed from input files and min/max input.
    Works much like Unix tools 'head' and 'tail' over all files together.

    Parameters:
        sfiles:         input file paths.
        line_range_min: read only the first N lines (head), or None.
        line_range_max: read only the last N lines (tail), or None.
                        Only one limit may be used. Negative values are
                        treated as "no limit".
        files_order:    'date', 'size' or 'name'; order in which files are
                        chained together.

    Returns a dict:
        'files':       one dict per selected file with
                       'file', 'line_start_global', 'line_end_global'
                       (first/last selected line counted over all files,
                       0-based, inclusive) and 'line_start_local',
                       'line_end_local' (0-based line indexes inside the
                       file; the end index is exclusive, so
                       range(line_start_local, line_end_local) yields
                       exactly the selected lines).
        'lines_total': number of selected lines.
        'range_min', 'range_max': first/last selected global line index.

    Raises Exception when both limits are given, the sorting order is
    missing, no input files are given or a file is not readable.
    """

    if line_range_min and line_range_max:
        raise Exception("Either first or last line limit can be used, not both.")

    if files_order is None:
        raise Exception("Sorting order for input files missing.")

    if line_range_min is not None and line_range_min < 0:
        line_range_min = None

    if line_range_max is not None and line_range_max < 0:
        line_range_max = None

    files_tmp = []
    for sfile in sfiles:

        if not self.check_file(sfile, "os.R_OK"):
            raise Exception("Couldn't read input file '{}'.".format(sfile))

        # Count lines without keeping the whole file in memory
        with open(sfile, 'r', errors = 'replace') as f:
            line_count = sum(1 for _ in f)

        files_tmp.append({
            'file': str(sfile),
            'modified_date': os.path.getmtime(sfile),
            'size': os.path.getsize(sfile),
            'line_count': line_count
        })

    if not files_tmp:
        raise Exception("No input files given.")

    sort_keys = {'date': 'modified_date', 'size': 'size', 'name': 'file'}
    if files_order in sort_keys:
        files_tmp.sort(key = lambda d: d[sort_keys[files_order]])

    lines_available = sum(d['line_count'] for d in files_tmp)

    # Selected window over all files, as a half-open range [start, end)
    window_start = 0
    window_end = lines_available
    limited = False

    if line_range_min is not None:
        # Read first N lines
        window_end = min(line_range_min, lines_available)
        limited = True
    elif line_range_max is not None:
        # Read last N lines
        window_start = max(lines_available - line_range_max, 0)
        limited = True

    selected = []
    offset = 0
    for sfile in files_tmp:
        file_first = offset
        file_end = offset + sfile['line_count']
        offset = file_end

        first = max(file_first, window_start)
        end = min(file_end, window_end)

        # With a limit in effect, files fully outside the window are dropped
        if limited and end <= first:
            continue
        end = max(end, first)

        selected.append({
            'file': sfile['file'],
            'line_start_global': first,
            'line_end_global': end - 1,
            'line_start_local': first - file_first,
            'line_end_local': end - file_first,
        })

    if not selected:
        # Nothing to read (e.g. a limit of 0): keep one entry with an
        # empty range so that callers can still index the file list.
        selected.append({
            'file': files_tmp[0]['file'],
            'line_start_global': 0,
            'line_end_global': -1,
            'line_start_local': 0,
            'line_end_local': 0,
        })

    return {
        'files': selected,
        'lines_total': max(window_end - window_start, 0),
        'range_min': window_start,
        'range_max': window_end - 1
    }
-
- """
- Get lines to be processed from input files and range input
- Range: <min> - <max>
- """
-
def get_file_lines_range(self, sfiles, line_range_min=None, line_range_max=None):
    """
    Get lines to be processed from input files and range input.
    Range: <min> - <max>

    Files are chained in the given order and their lines are numbered
    from 0 over all files.

    Parameters:
        sfiles:         input file paths.
        line_range_min: first wanted global line index; None or a negative
                        value means "from the first line".
        line_range_max: last wanted global line index (inclusive); None or
                        a negative value means "to the last line".

    Returns a dict:
        'files':       one dict per file overlapping the range, with
                       'file', 'line_start_global', 'line_end_global'
                       (inclusive, clipped to the range),
                       'modified_date' and 'size'.
        'lines_total': number of lines inside the range.
        'range_min', 'range_max': first/last selected global line index
                       (both 0 when nothing is selected).

    Raises Exception when a file is not readable.
    """

    files_and_lines = {'files': [], 'lines_total': 0, 'range_min': 0, 'range_max': 0}

    if line_range_min is not None and line_range_min < 0:
        line_range_min = None

    if line_range_max is not None and line_range_max < 0:
        line_range_max = None

    counted = []
    for sfile in sfiles:

        if not self.check_file(sfile, "os.R_OK"):
            raise Exception("Couldn't read input file '{}'.".format(sfile))

        with open(sfile, 'r', errors = 'replace') as f:
            counted.append((sfile, sum(1 for _ in f)))

    lines_available = sum(line_count for _, line_count in counted)

    # Open-ended limits fall back to the first/last available line
    wanted_first = 0 if line_range_min is None else line_range_min
    wanted_last = lines_available - 1
    if line_range_max is not None:
        wanted_last = min(line_range_max, wanted_last)

    offset = 0
    for sfile, line_count in counted:
        file_first = offset
        file_last = offset + line_count - 1
        offset += line_count

        line_start = max(file_first, wanted_first)
        line_end = min(file_last, wanted_last)

        # Skip files that do not overlap the wanted range
        if line_end < line_start:
            continue

        files_and_lines['files'].append({
            'file': str(sfile),
            'line_start_global': line_start,
            'line_end_global': line_end,
            'modified_date': os.path.getmtime(sfile),
            'size': os.path.getsize(sfile)
        })

    if files_and_lines['files']:
        range_line_start = files_and_lines['files'][0]['line_start_global']
        range_line_end = files_and_lines['files'][-1]['line_end_global']
        files_and_lines['lines_total'] = range_line_end - range_line_start + 1
        files_and_lines['range_min'] = range_line_start
        files_and_lines['range_max'] = range_line_end

    return files_and_lines
-
- """
- Date checker
- """
def date_checker(self, date_lower, date_upper, entry_time):
    """
    Date checker.

    Parameters:
        date_lower: datetime of the earliest accepted day, or None.
        date_upper: datetime of the latest accepted day, or None.
        entry_time: datetime of the log entry.

    Limits work with day precision and both limit days are included, so
    equal lower and upper limits select exactly that one day.

    Returns True when the entry falls inside the limits.
    Raises Exception when the lower limit is later than the upper limit
    or when a limit is in the future.
    """

    if date_upper is not None and date_lower is not None:
        if date_lower > date_upper:
            raise Exception("Earlier day can't be later than later day")

    now = datetime.now()
    for limit in (date_upper, date_lower):
        if limit is not None and limit > now:
            raise Exception("Day can't be in the future")

    entry_day = entry_time.date()

    if date_lower is not None and entry_day < date_lower.date():
        return False

    if date_upper is not None and entry_day > date_upper.date():
        return False

    return True
-
- """
- Get output field definitions (sortby)
- """
def get_out_field(self, fields, field_input):
    """
    Locate a field name among output fields (used for sorting).

    Returns [True, position] for the first field equal to 'field_input',
    otherwise [False, number of fields].
    """
    visited = 0
    for visited, name in enumerate(fields, start = 1):
        if name == field_input:
            return [True, visited - 1]
    return [False, visited]
-
- """
- Get included fields
- """
def get_included_fields(self, fields, included_fields, excluded_fields=None):
    """
    Get included fields.

    Parameters:
        fields:          field definitions keyed by field name; each value
                         is a dict with at least 'sort_index'.
        included_fields: wanted field names in output order. None, an
                         empty list or a list containing 'all' selects
                         every defined field in 'sort_index' order.
        excluded_fields: field names removed from the selection, or None.

    Spaces inside field names are ignored.

    Returns a dict of the selected field definitions in output order. The
    selected definitions are updated in place: 'included' is set to True
    and 'sort_index' to the output position.
    Raises Exception for unknown field names or an empty selection.
    """

    included = [name.replace(' ', '') for name in (included_fields or [])]
    excluded = [name.replace(' ', '') for name in (excluded_fields or [])]

    if 'all' in excluded:
        raise Exception("No output fields defined.")

    for name in included + excluded:
        if name != 'all' and name not in fields:
            raise Exception("Unknown field value: {}. Accepted values: {}".format(name, ','.join(fields.keys())))

    if not included or 'all' in included:
        included = sorted(fields, key = lambda name: fields[name]['sort_index'])

    fields_out = {}
    for name in included:
        # Skip excluded fields and fields listed more than once
        if name in excluded or name in fields_out:
            continue
        value = fields[name]
        value['sort_index'] = len(fields_out)
        value['included'] = True
        fields_out[name] = value

    if not fields_out:
        raise Exception("No output fields defined.")

    return fields_out
-
- """
- Process input files
- """
def process_files(self):
    """
    Process input files.

    Resolves the log format, filters and output fields from the parsed
    arguments, reads the selected line range of every input file and
    collects one output row per log entry that passes the date, status
    code and country filters.

    Returns a list:
        [0] output rows (list of field values per entry)
        [1] processed file descriptions (as returned in 'files' by
            get_file_lines_head_tail)
        [2] total line count reported for the selection
        [3] tab separated format string of the output columns
        [4] (field name, human readable name) tuples of output columns
        [5] (file, line number) tuples of lines that could not be parsed

    Raises Exception when the log format can't be determined or an input
    file is not readable.
    """

    prev_host = ""
    prev_host_time = None
    # Most recently parsed entry; None until a line has been parsed
    entry_data = None
    log_entries = []
    codes = []
    countries = []

    # Log format as defined in Apache/HTTPD configuration file (LogFormat directive) or manually by user
    if self.args.log_format:
        log_format = self.args.log_format
    elif self.args.httpd_conf_file:
        log_format = self.get_httpd_logformat_directive(self.args.httpd_conf_file, self.args.httpd_log_nickname)
    else:
        log_format = None

    if not log_format:
        raise Exception("Couldn't determine Apache HTTPD log format.")

    # Remove bytes in & out fields from local traffic pattern
    log_format_local = log_format.replace('%I','').replace('%O','').strip()

    parser = LogParser(log_format)
    parser_local = LogParser(log_format_local)
    private_host = re.compile('|'.join(self.private_class_ip_networks))

    if self.args.codes:
        codes = self.get_input_status_codes(self.populate_status_codes(), self.args.codes)

    if self.args.countries:
        countries = self.args.countries

    date_lower = self.args.date_lower
    date_upper = self.args.date_upper
    day_format = "%d-%m-%Y"

    if date_lower is not None:
        date_lower = datetime.strptime(date_lower, day_format)
    if date_upper is not None:
        date_upper = datetime.strptime(date_upper, day_format)

    geotool_exec = self.args.geotool_exec
    geo_database_location = self.args.geo_database_location

    incl_fields = self.args.incl_fields
    if isinstance(incl_fields, str):
        incl_fields = incl_fields.split(',')
    # Work on a copy so that the parsed arguments stay untouched
    incl_fields = list(incl_fields or [])

    use_geolocation = self.args.use_geolocation

    if use_geolocation:
        for geo_field in ('country', 'city'):
            if geo_field not in incl_fields:
                incl_fields.append(geo_field)

    if 'country' in incl_fields or 'city' in incl_fields:
        use_geolocation = True

    # Checked only after geolocation may have been enabled implicitly by
    # the included fields above.
    geotool_ok = False
    if use_geolocation:
        if self.check_file(geotool_exec, "os.X_OK", "PATH") and self.check_file(geo_database_location, "os.R_OK"):
            geotool_ok = True

    fields = self.get_included_fields(
        self.get_out_fields(),
        incl_fields,
        self.args.excl_fields
    )

    invalid_lines = []
    field_names = []
    geo_data = None
    skip_line_by_status = False
    skip_line_by_country = False
    stri = ""

    files_input = self.get_files(self.args.files_regex, self.args.files_list)
    files_process_data = self.get_file_lines_head_tail(
        files_input,
        self.args.read_first_lines_num,
        self.args.read_last_lines_num,
        self.args.sort_logs_by_info
    )

    lines_total = files_process_data['lines_total']
    files_total = len(files_process_data['files'])
    show_progress = self.args.show_progress or self.args.verbose

    self.txt.print_verbose(
        'Log entry range',
        str(files_process_data['files'][0]['line_start_global'])
        + ' - ' +
        str(files_process_data['files'][-1]['line_end_global'])
    )

    if show_progress:
        print(
            "File count: {}\nLines in total: {}".format(
                str(files_total),
                str(lines_total)
        ))

    for file_num, lfile in enumerate(files_process_data['files']):

        if show_progress:
            print("Processing file: {:s} (lines: {:d}-{:d})".format(
                lfile['file'],
                lfile['line_start_global'], lfile['line_end_global']
            ))

        if not self.check_file(lfile['file'], "os.R_OK"):
            raise Exception("Couldn't read input file '{}'.".format(lfile['file']))

        with open(lfile['file'], 'r', errors = 'replace') as f:
            file_lines = list(f)

        range_start = lfile['line_start_local']
        # Never index past the lines actually read from the file
        range_end = min(lfile['line_end_local'], len(file_lines))
        lines = range(range_start, range_end)

        for line_num, line in enumerate(lines, start = 1):

            if show_progress:
                print("Processing log entry: {:d}/{:d} ({}%)".format(
                    line_num,
                    len(lines),
                    round(100 * (line_num/len(lines)), 2)
                ), end = "\r")

            # Remember the previous entry, unless it was filtered out
            if line_num != 1 and not (skip_line_by_status or skip_line_by_country) and entry_data:
                prev_host = entry_data['remote_host']
                prev_host_time = entry_data['time']

            raw_line = file_lines[line]

            try:
                if private_host.match(raw_line):
                    entry = parser_local.parse(raw_line)
                else:
                    entry = parser.parse(raw_line)
            except InvalidEntryError:
                # Report the real line number inside the file
                invalid_lines.append((lfile['file'], line + 1))
                continue

            # The log format does not necessarily record the User-Agent header
            try:
                user_agent = entry.headers_in["User-Agent"]
            except (AttributeError, KeyError, TypeError):
                user_agent = None

            entry_data = {
                'time': entry.request_time.replace(tzinfo = None),
                'user_agent': user_agent,
                'http_request': str(entry.request_line).encode('unicode_escape').decode(),
                'remote_host': entry.remote_host,
                'status': entry.final_status
            }

            if not self.date_checker(date_lower, date_upper, entry_data['time']):
                continue

            if len(codes) > 0:
                skip_line_by_status = self.filter_status_code(codes, entry_data['status'])

            # Decided again for every entry so that no earlier result leaks in
            skip_line_by_country = False

            if use_geolocation:
                # Geo data of the previous entry is reused for the same host
                if prev_host != entry_data['remote_host']:
                    geo_data = self.geotool_get_data(geotool_ok, geotool_exec, geo_database_location, entry_data['remote_host'])

                if len(countries) > 0 and geo_data is not None:
                    skip_line_by_country = self.filter_country(countries, geo_data['host_country'])

            if skip_line_by_status or skip_line_by_country:
                continue

            time_diff = str('NEW_CONN')
            if prev_host == entry_data['remote_host'] and prev_host_time is not None:
                time_diff = (entry_data['time'] - prev_host_time).total_seconds()
                if isinstance(time_diff, float):
                    time_diff = int(time_diff)
                if time_diff > 0:
                    time_diff = "+" + str(time_diff)
            if line_num == 1 and file_num == 0:
                time_diff = int(0)

            if 'log_file_name' in fields:
                fields['log_file_name']['data'] = lfile['file']
            if 'http_status' in fields:
                fields['http_status']['data'] = entry_data['status']
            if 'remote_host' in fields:
                fields['remote_host']['data'] = entry_data['remote_host']

            # Reset geo fields when no data is available, so that values
            # of an earlier entry are not repeated.
            if 'country' in fields:
                fields['country']['data'] = geo_data['host_country'] if geo_data is not None else None
            if 'city' in fields:
                fields['city']['data'] = geo_data['host_city'] if geo_data is not None else None

            if 'time' in fields:
                fields['time']['data'] = entry_data['time']
            if 'time_diff' in fields:
                fields['time_diff']['data'] = time_diff
            if 'user_agent' in fields:
                fields['user_agent']['data'] = entry_data['user_agent']
            if 'http_request' in fields:
                fields['http_request']['data'] = entry_data['http_request']

            stri = ""
            printargs = []

            for key, value in fields.items():
                if not use_geolocation and (key == 'country' or key == 'city'):
                    continue
                if value['included']:
                    stri += "\t" + value['format']
                    printargs.append(value['data'])

                    if not any(key in i for i in field_names):
                        field_names.append((key, value['human_name']))

            log_entries.append(printargs)

    return [log_entries, files_process_data['files'], lines_total, stri, field_names, invalid_lines]
-
- """
- Execute
- """
def execute(self):
    """
    Execute: process the input files and print the results.

    Output is written to standard output either as a tab separated table
    or as CSV, optionally preceded by column headers and followed by
    short statistics.

    Raises Exception when the sort-by field is not among the included
    fields or when reverse sorting is requested without a sort-by field.
    """
    import csv
    import sys

    print_headers = self.args.column_headers
    show_stats = self.args.show_stats
    output_format = self.args.output_format

    sortby_field = self.args.sortby_field
    reverse_order = self.args.sortby_reverse

    if self.args.incl_fields:
        if 'all' not in self.args.incl_fields:
            if sortby_field and sortby_field not in self.args.incl_fields:
                raise Exception("Sort-by field must be included in output fields.")

    # Fail before any file is read
    if sortby_field is None and reverse_order:
        raise Exception("You must define a field for reverse sorting.")

    results = self.process_files()
    result_entries = results[0]
    result_files = results[1]
    result_lines = results[2]
    stri = results[3]
    out_fields = [i[0] for i in results[4]]
    out_fields_human_names = [i[1] for i in results[4]]
    invalid_lines = results[5]

    if sortby_field is not None:
        out_field_validation = self.get_out_field(out_fields, sortby_field)
        if out_field_validation[0]:
            column = out_field_validation[1]
            try:
                result_entries.sort(
                    key = lambda r : r[column] or '',
                    reverse = reverse_order
                )
            except TypeError:
                # A column may mix value types (e.g. numbers and text);
                # compare such values by their text form.
                result_entries.sort(
                    key = lambda r : str(r[column] or ''),
                    reverse = reverse_order
                )

    if output_format == 'table':

        if print_headers:
            print("\n")
            print(stri.format(*out_fields_human_names).lstrip())

        for entry in result_entries:
            entry_items = [str(item) for item in entry]
            print(stri.format(*entry_items).lstrip())

    if output_format == 'csv':

        # csv.writer quotes values that contain commas or quotes
        writer = csv.writer(sys.stdout, lineterminator = '\n')

        if print_headers:
            writer.writerow(out_fields_human_names)

        for entry in result_entries:
            writer.writerow([str(item) for item in entry])

    if show_stats:
        print(("\n" +
            "Processed files: {:s}\n" +
            "Processed log entries: {:d}\n" +
            "Matched log entries: {:d}\n"
            ).format(
                # result_files is the list of processed file descriptions
                ', '.join([i['file'] for i in result_files]),
                result_lines,
                len(result_entries)
            )
        )
        if len(invalid_lines) > 0:
            print("Invalid lines:")
            for i in invalid_lines:
                print("\tFile: {:s}, line: {:d}".format(i[0], i[1]))
            print("\n")
-
# Script entry point: build the application (parses arguments) and run it
if __name__ == "__main__":
    program().execute()
|