From 27c8970d3cc5d05050492024cbb8735f0553b182 Mon Sep 17 00:00:00 2001 From: Pekka Helenius Date: Sun, 19 Jun 2022 13:33:41 +0300 Subject: [PATCH] Refactor the codebase --- apache-logparser/logparser.py | 1108 ++++++++++++++++++++++----------- 1 file changed, 744 insertions(+), 364 deletions(-) diff --git a/apache-logparser/logparser.py b/apache-logparser/logparser.py index a019a5d..37bb1b9 100644 --- a/apache-logparser/logparser.py +++ b/apache-logparser/logparser.py @@ -1,7 +1,7 @@ #!/bin/env python -# Simple Apache log parser -# Copyright (C) 2020 Pekka Helenius +# Simple Apache HTTPD log file parser +# Copyright (C) 2022 Pekka Helenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -19,398 +19,778 @@ ################################################################ # TODO prev_host: instead of comparing to previous entry, check if such IP has been seen in XXX seconds -# store IP values for temporary list for XXX seconds, and check list values +# TODO: store IP values for temporary list for XXX seconds, and check list values import argparse import os import re import subprocess -import sys + from datetime import datetime -from apachelogs import LogParser - -out_fields_list = ['log_file_name', 'http_status', 'remote_host', 'country', 'city', 'time', 'time_diff', 'user_agent', 'http_request'] -out_timeformat = "%d-%m-%Y %H:%M:%S" -dayformat = "%d-%m-%Y" -ot = '"' + re.sub(r'%', '%%', out_timeformat) + '"' -geotool = "geoiplookup" -geodb = "/usr/share/GeoIP/" - -# Log format as defined in Apache/HTTPD configuration file (LogFormat directive) -in_log_syntax = "%h %u %t %T \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" \"%{cache-status}e\" %I %O" - -argparser = argparse.ArgumentParser() - -argparser.add_argument('-d', '--dir', help = 'Apache log file directory.', nargs = '?', dest = 'log_dir', required = True) -argparser.add_argument('-f', '--files', help = 'Apache log files. Regular expressions supported.', nargs = '+', dest = 'log_file', required = True) -argparser.add_argument('-s', '--logsyntax', help = 'Apache log files syntax, defined as "LogFormat" directive in Apache configuration.', nargs = '?', dest = 'log_syntax') -argparser.add_argument('-c', '--statuscodes', help = 'Print only these status codes. Regular expressions supported.', nargs = '+', dest = 'status_code') -argparser.add_argument('-cf', '--countryfilter', help = 'Include only these countries. 
Negative match (exclude): "\!Country"', nargs = '+', dest = 'country')
-argparser.add_argument('-ot', '--outtimeformat', help = 'Output time format.\nDefault: ' + ot, nargs = '?', dest = 'out_timeformat')
-argparser.add_argument('-of', '--outfields', help = 'Output fields.\nDefault: ' + ', '.join(out_fields_list), nargs = '+', dest = 'out_field')
-argparser.add_argument('-ng', '--nogeo', help = 'Skip country check with external "geoiplookup" tool.', action='store_true', dest = 'no_geo')
-argparser.add_argument('-gd', '--geodir', help = 'Database file directory for "geoiplookup" tool.\nDefault: ' + geodb, nargs = '?', dest = 'geodb')
-argparser.add_argument('-dl', '--daylower', help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020', nargs = '?', dest = 'day_lower')
-argparser.add_argument('-du', '--dayupper', help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020', nargs = '?', dest = 'day_upper')
-argparser.add_argument('-sb', '--sortby', help = 'Sort by an output field.', nargs = '?', dest = 'sortby_field')
-argparser.add_argument('-sbr', '--sortbyreverse', help = 'Sort by an output field, reverse order.', nargs = '?', dest = 'sortby_field_reverse')
-argparser.add_argument('-st', '--stats', help = 'Show short statistics at the end.', action='store_true', dest = 'show_count')
-argparser.add_argument('-np', '--noprogress', help = 'Do not show progress information.', action='store_true', dest = 'no_progress')
-args = argparser.parse_args()
-
-if args.status_code is None:
-    status_filter = False
-    skip_line_1 = False
-else:
-    status_filter = True
-    skip_line_1 = True
-    status_codes = args.status_code
-
-    http_valid_codes = [
-        '100',
-        '101',
-        '102',
-        '103',
-        '200',
-        '201',
-        '202',
-        '203',
-        '204',
-        '205',
-        '206',
-        '207',
-        '208',
-        '226',
-        '300',
-        '301',
-        '302',
-        '303',
-        '304',
-        '305',
-        '306',
-        '307',
-        '308',
-        '400',
-        '401',
-        '402',
-        '403',
-        '404',
-        '405',
-        '406',
-        '407',
-        '408',
-        '409',
-        '410',
-        '411',
-        '412',
-        '413',
-        '414',
-        '415',
-        '416',
-        '417',
-        '418',
-        '421',
-        '422',
-        '423',
-        '424',
-        '425',
-        '426',
-        '428',
-        '429',
-        '431',
-        '451',
-        '500',
-        '501',
-        '502',
-        '503',
-        '504',
-        '505',
-        '506',
-        '507',
-        '508',
-        '510',
-        '511',
-        '218'
-    ]
+from apachelogs import LogParser, InvalidEntryError
+
+class program(object):
+
+    """
+    Init
+    """
+    def __init__(self):
+        self.get_args()
+        # Exclude private IP address classes from the geo lookup process:
+        # 127.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
+        self.private_class_ip_networks = ['^127\.', '^172\.(1[6-9]|2[0-9]|3[0-1])\.', '^192\.168\.']
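
A quick standalone check of the private-range patterns above; this snippet is illustrative only and not part of the patch:

    import re

    private_nets = [r'^127\.', r'^172\.(1[6-9]|2[0-9]|3[0-1])\.', r'^192\.168\.']
    for ip in ('127.0.0.1', '172.16.4.2', '172.32.0.1', '192.168.1.10', '8.8.8.8'):
        scope = 'local' if re.match('|'.join(private_nets), ip) else 'public'
        print(ip, '->', scope)

172.32.0.1 lies outside 172.16.0.0/12 and should report "public" along with 8.8.8.8; the other three addresses report "local".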
+
+    """
+    Define & get output fields
+    """
+    def get_out_fields(self):
+        out_fields = {
+            'log_file_name': {'data': None, 'format': '{:s}', 'included': False, 'human_name': 'Log file name'},
+            'http_status': {'data': None, 'format': '{:3s}', 'included': True, 'human_name': 'Status'},
+            'remote_host': {'data': None, 'format': '{:15s}', 'included': True, 'human_name': 'Remote IP'},
+            'country': {'data': None, 'format': '{:20s}', 'included': False, 'human_name': 'Country'},
+            'city': {'data': None, 'format': '{:15s}', 'included': False, 'human_name': 'City'},
+            'time': {'data': None, 'format': '{:20s}', 'included': True, 'human_name': 'Date/Time'},
+            'time_diff': {'data': None, 'format': '{:8s}', 'included': True, 'human_name': 'Time diff'},
+            'user_agent': {'data': None, 'format': '{:s}', 'included': True, 'human_name': 'User agent'},
+            'http_request': {'data': None, 'format': '{:s}', 'included': True, 'human_name': 'Request'}
+        }
+        return out_fields
+
+    """
+    Argument parser
+    """
+    def get_args(self):
+
+        all_fields = self.get_out_fields()
+        incl_fields = [i for i in all_fields.keys() if all_fields[i]['included']]
+        out_time_format = "%d-%m-%Y %H:%M:%S"
+
+        argparser = argparse.ArgumentParser(
+            description = 'Apache HTTPD server log parser',
+            formatter_class = argparse.ArgumentDefaultsHelpFormatter
+        )
+
+        argparser.add_argument(
+            '-fr', '--files-regex',
+            help = 'Apache log files matching input regular expression.',
+            nargs = '?',
+            dest = 'files_regex',
+            required = False
+        )
+        argparser.add_argument(
+            '-f', '--files-list',
+            help = 'Comma-separated list of Apache log files.',
+            nargs = '?',
+            type = lambda x: [i for i in x.split(',')],
+            dest = 'files_list',
+            required = False
+        )
+        argparser.add_argument(
+            '-c', '--status-codes',
+            help = 'Print only these numerical status codes.\nRegular expressions supported.',
+            nargs = '+',
+            dest = 'codes'
+        )
+        argparser.add_argument(
+            '-cf', '--countries',
+            help = 'Include only these countries.\nNegative match (exclude): "\!Country"',
+            nargs = '?',
+            type = lambda x: [i for i in x.split(',')],
+            dest = 'countries'
+        )
+        argparser.add_argument(
+            '-tf', '--time-format',
+            help = 'Output time format.',
+            nargs = '?',
+            dest = 'time_format',
+            default = out_time_format
+        )
+        argparser.add_argument(
+            '-if', '--included-fields',
+            help = 'Included fields.\nAll fields: all, ' + ', '.join(all_fields),
+            nargs = '?',
+            dest = 'incl_fields',
+            type = lambda x: [i for i in x.split(',')],
+            default = incl_fields
+        )
+        argparser.add_argument(
+            '-ef', '--excluded-fields',
+            help = 'Excluded fields.',
+            nargs = '?',
+            dest = 'excl_fields',
+            type = lambda x: [i for i in x.split(',')],
+            default = None
+        )
+        argparser.add_argument(
+            '-gl', '--geo-location',
+            help = 'Check origin countries with external "geoiplookup" tool.\nNOTE: Automatically includes "country" and "city" fields.',
+            action = 'store_true',
+            dest = 'use_geolocation'
+        )
+        argparser.add_argument(
+            '-ge', '--geotool-exec',
+            help = '"geoiplookup" tool executable found in PATH.',
+            nargs = '?',
+            dest = 'geotool_exec',
+            default = "geoiplookup"
+        )
+        argparser.add_argument(
+            '-gd', '--geo-database-dir',
+            help = 'Database file directory for "geoiplookup" tool.',
+            nargs = '?',
+            dest = 'geo_database_location',
+            default = '/usr/share/GeoIP/'
+        )
+        argparser.add_argument(
+            '-dl', '--day-lower',
+            help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020',
+            nargs = '?',
+            dest = 'date_lower'
+        )
+        argparser.add_argument(
+            '-du', '--day-upper',
+            help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020',
+            nargs = '?',
+            dest = 'date_upper'
+        )
+        argparser.add_argument(
+            '-sb', '--sort-by',
+            help = 'Sort by an output field.',
+            nargs = '?',
+            dest = 'sortby_field'
+        )
+        argparser.add_argument(
+            '-ro', '--reverse-order',
+            help = 'Sort in reverse order.',
+            dest = 'sortby_reverse',
+            action = 'store_true'
+        )
+        argparser.add_argument(
+            '-st', '--show-stats',
+            help = 'Show short statistics at the end.',
+            action = 'store_true',
+            dest = 'show_stats'
+        )
+        argparser.add_argument(
+            '-p', '--show-progress',
+            help = 'Show progress information.',
+            dest = 'show_progress',
+            action = 'store_true'
+        )
+        argparser.add_argument(
+            '--httpd-conf-file',
+            help = 'Apache HTTPD configuration file with LogFormat directive.',
+            nargs = '?',
+            dest = 'httpd_conf_file',
+            default = '/etc/httpd/conf/httpd.conf'
+        )
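
The comma-separated list options above rely on argparse's type callback; roughly, the conversion behaves like this (a standalone sketch, option name reused for illustration):

    import argparse

    p = argparse.ArgumentParser()
    # Same conversion style as --files-list, --countries and --included-fields
    p.add_argument('--countries', type=lambda x: [i for i in x.split(',')])
    print(p.parse_args(['--countries', 'Finland,Sweden']).countries)
    # ['Finland', 'Sweden']

Note that argparse also passes a plain string default through the same type callback, so list-typed options are safest with a list default, as --included-fields uses above.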
+        argparser.add_argument(
+            '--httpd-log-nickname',
+            help = 'LogFormat directive nickname.',
+            nargs = '?',
+            dest = 'httpd_log_nickname',
+            default = 'combinedio'
+        )
+        argparser.add_argument(
+            '-lf', '--log-format',
+            help = 'Log format, manually defined.',
+            dest = 'log_format',
+            required = False
+        )
+        argparser.add_argument(
+            '-ph', '--print-headers',
+            help = 'Print column headers.',
+            dest = 'column_headers',
+            required = False,
+            action = 'store_true'
+        )
+        argparser.add_argument(
+            '--output-format',
+            help = 'Output format for results.',
+            dest = 'output_format',
+            required = False,
+            default = 'table',
+            choices = ['table', 'csv']
+        )
+        args = argparser.parse_args()
+        return args
+
+    """
+    Populate recognized HTTP status codes
+    """
+    def populate_status_codes(self):
+
+        http_valid_codes = [
+            '100-103',
+            '200-208',
+            '218',
+            '226',
+            '300-308',
+            '400-431',
+            '451',
+            '500-511'
+        ]
+
+        codes = []
+        for code in http_valid_codes:
+            if len(code.split('-')) == 2:
+                code_start = int(code.split('-')[0])
+                code_end = int(code.split('-')[1])
+                # Range end is inclusive
+                for i in range(code_start, code_end + 1):
+                    codes.append(str(i))
+            else:
+                codes.append(code)
+
+        return codes
+
+    """
+    Get valid HTTP status codes from user input
+    """
+    def get_input_status_codes(self, valid_codes, user_codes):
+
+        codes = []
+
+        for user_code in user_codes:
+            user_code = str(user_code)
+            validated = False
+            code_appended = False
+
+            for valid_code in valid_codes:
+
+                if re.search(user_code, valid_code):
+                    validated = True
+                    code_appended = True
+                    codes.append((valid_code, validated))
+                else:
+                    validated = False
+
+            if not code_appended:
+                codes.append((user_code, validated))
+
+        return codes
+
+    """
+    Get log file list
+    """
+    def get_files(self, files_regex=None, files_list=None):
+
+        files = []
+
+        if files_regex is None and files_list is None:
+            raise Exception("Either a file list or a regex file selection method is required.")
+
+        if files_regex and files_list:
+            raise Exception("File list and regex file selection methods are mutually exclusive.")
+
+        if files_regex:
+            log_dir = os.path.dirname(files_regex) or '.'
+            file_part = os.path.basename(files_regex)
+            for lfile in os.listdir(log_dir):
+                if os.path.isfile(os.path.join(log_dir, lfile)):
+                    if re.match(file_part, lfile):
+                        files.append(os.path.join(log_dir, lfile))
+
+        if files_list:
+            for lfile in files_list:
+                if os.path.isfile(lfile):
+                    files.append(lfile)
+
+        if len(files) == 0:
+            raise Exception("No matching files found.")
+
+        files.sort()
+        return files
+
+    """
+    Common file checker
+    """
+    def check_file(self, sfile, flag, env = None):
+
+        file_path = sfile
+
+        if env is not None:
+            for path in os.environ[env].split(os.pathsep):
+                file_path = os.path.join(path, sfile)
+                if os.path.isfile(file_path):
+                    break
+
+        # 'flag' is a string such as "os.X_OK"; eval() maps it to the os access mode constant
+        if os.access(file_path, eval(flag)):
+            return True
+        return False
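
For illustration, the interplay of the range expansion and the regex matching above can be condensed into a standalone sketch (simplified, not part of the patch):

    import re

    def expand(ranges):
        codes = []
        for code in ranges:
            parts = code.split('-')
            if len(parts) == 2:
                # inclusive range, e.g. '400-431'
                codes += [str(i) for i in range(int(parts[0]), int(parts[1]) + 1)]
            else:
                codes.append(code)
        return codes

    valid = expand(['200-208', '301', '400-431'])
    # A user filter like '40[34]' selects 403 and 404
    print([v for v in valid if re.search('40[34]', v)])

-    code_statuses = []
-    for status_input in status_codes:
-        init_status = False
-        status_append = status_input
-        status_appended = False
+
+    """
+    Get Apache HTTPD LogFormat directive syntax
+    """
+    def get_httpd_logformat_directive(self, cfile, tag=None):
+
+        try:
+            log_format = None
+            with open(cfile, 'r') as f:
+                for line in f:
+                    if re.search('^[ ]+LogFormat ".*' + tag, line):
+                        r = re.search('^[ ]+LogFormat "(.*)(!?("))', line)
+                        log_format = r.groups()[0].replace('\\', '')
+                        break
+            return log_format
-        for status_valid in http_valid_codes:
-            if re.search(status_input, status_valid):
-                status_append = status_valid
-                init_status = True
-                status_appended = True
-                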
code_statuses.append((status_append, init_status)) - else: - init_status = False - if not status_appended: - code_statuses.append((status_append, init_status)) - - error_msg = "" - for vl in code_statuses: - status, init_status = vl - - if not init_status: - error_msg += "Invalid status code '" + status + "' supplied\n" - - if error_msg != "": - raise Exception("\n" + error_msg) - -if args.country is None: - country_filter = False - skip_line_2 = False -else: - country_filter = True - countries_filter_list = args.country - skip_line_2 = True - -if args.out_timeformat is not None: - out_timeformat = args.out_timeformat - -if args.out_field is not None: - out_fields_list = args.out_field - -if args.day_lower is not None: - day_lower = datetime.strptime(args.day_lower, dayformat) -else: - day_lower = None -if args.day_upper is not None: - day_upper = datetime.strptime(args.day_upper, dayformat) -else: - day_upper = None - -if args.log_syntax is None: - log_syntax = in_log_syntax -else: - log_syntax = args.log_syntax - -log_dir = args.log_dir -files = args.log_file -no_progress = args.no_progress -files_tmp = [] -parser = LogParser(log_syntax) - -for file_regex in files: - for file in os.listdir(log_dir): - fullpath = log_dir + file - if os.path.isfile(fullpath): - if re.search(file_regex, file): - files_tmp.append(file) - - files_tmp.sort() - files = files_tmp - -def fileCheck(file, flag, env=None): - if env is None: - filepath = file + except: + raise Exception("Couldn't open Apache HTTPD configuration file.") + + """ + Geotool processing + """ + def geotool_get_data(self, geotool_exec, database_file, remote_host): + + host_country = None + host_city = None + + if re.match('|'.join(self.private_class_ip_networks), remote_host): + host_country = "Local" + host_city = "Local" + return { + 'host_country': host_country, + 'host_city': host_city + } + + if self.check_file(geotool_exec, "os.X_OK", "PATH") and self.check_file(database_file, "os.R_OK"): + + host_country_main = subprocess.check_output([geotool_exec,'-d', database_file, remote_host]).rstrip().decode() + host_country_main = host_country_main.split('\n') + + try: + host_country = host_country_main[0].split(', ')[1] + except: + if re.search("Address not found", host_country_main[0]): + host_country = "Unknown" + + if len(host_country_main) > 1: + try: + host_city = host_country_main[1].split(', ')[4] + if re.search("N/A", host_city): + host_city = "Unknown: " + host_country_main[1].split(', ')[6] + ', ' + host_country_main[1].split(', ')[7] + except: + pass + + return { + 'host_country': host_country, + 'host_city': host_city + } + return None + + """ + Status code filter + """ + def filter_status_code(self, status_codes, final_status): + + skip_line = True + + for status in status_codes: + + # Status consists of numerical status value (num) and validity boolean value (num_ok) + if len(status) != 2: + continue + + num, num_ok = status + + if num_ok: + status = int(num) + + if status == final_status: + skip_line = False + break + + return skip_line + + """ + Country name filter + """ + def filter_country(self, countries, host_country): + + skip_line = True + + for country in countries: + if country[1] == "!": + country = country[2:] + if country.lower() == host_country.lower(): + skip_line = True + break + else: + skip_line = False + + elif country.lower() == host_country.lower(): + skip_line = False + break + + return skip_line + + """ + Get total number of lines in files + """ + def get_file_line_count(self, sfiles): + + lines_in_files 
= []
+
+        for sfile in sfiles:
+            try:
+                with open(sfile, 'r') as f:
+                    line_count = sum(1 for _ in f)
+                lines_in_files.append({
+                    'file': str(sfile),
+                    'lines': int(line_count)
+                })
+            except OSError:
+                raise Exception("Couldn't read input file " + sfile)
+
+        return lines_in_files
+
+    """
+    Date checker
+    """
+    def date_checker(self, date_lower, date_upper, entry_time):
+
+        # TODO Handle situations where date_upper & date_lower are equal
+
+        if date_upper is not None and date_lower is not None:
+            if date_lower > date_upper:
+                raise Exception("Lower day boundary can't be later than upper day boundary")
+
+        if date_upper is not None:
+            if date_upper > datetime.now():
+                raise Exception("Day can't be in the future")
+
+        if date_lower is not None:
+            if date_lower > datetime.now():
+                raise Exception("Day can't be in the future")
+
+        if date_lower is not None:
+            if entry_time <= date_lower: return False
+
+        if date_upper is not None:
+            if entry_time >= date_upper: return False
+
+        return True
+
+    """
+    Get output field definitions (sortby)
+    """
+    def get_out_field(self, fields, field_input):
+
+        i = 0
+        for field in fields:
+            if field == field_input:
+                return [True, i]
+            i += 1
+        return [False, i]
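
The date bounds above are strict; a standalone illustration of how entries are kept or dropped (dates invented):

    from datetime import datetime

    day_format = "%d-%m-%Y"
    date_lower = datetime.strptime("01-06-2022", day_format)
    date_upper = datetime.strptime("30-06-2022", day_format)

    for day in ("01-06-2022", "19-06-2022", "01-07-2022"):
        entry_time = datetime.strptime(day, day_format)
        keep = date_lower < entry_time < date_upper
        print(day, '->', 'kept' if keep else 'dropped')

An entry stamped exactly on the lower or upper day is dropped, since the comparisons are strict.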
+
+    """
+    Get included fields
+    """
+    def get_included_fields(self, fields, included_fields, excluded_fields=None):
+
+        included_values = []
+        all_defined_fields = []
+
+        if included_fields is None or 'all' in included_fields:
+            included_fields = [i for i in fields.keys()]
+
+        if excluded_fields is not None:
+            if 'all' in excluded_fields:
+                raise Exception("No output fields defined.")
+            included_fields = [i for i in included_fields if i not in excluded_fields]
+            all_defined_fields = included_fields + excluded_fields
+        else:
+            all_defined_fields = included_fields
+
+        for i in all_defined_fields:
+            if i not in fields.keys():
+                raise Exception("Unknown field value: {}. Accepted values: {}".format(i, ', '.join(fields.keys())))
+
+        for key, value in fields.items():
+            if key in included_fields:
+                value['included'] = True
+            else:
+                value['included'] = False
+            included_values.append(value['included'])
+
+        if True not in included_values:
+            raise Exception("No output fields defined.")
+        return fields
+
+    """
+    Process input files
+    """
+    def process_files(self, user_arguments):
+
+        prev_host = ""
+        log_entries = []
+        codes = []
+        countries = []
+
+        # Log format as defined in Apache/HTTPD configuration file (LogFormat directive) or manually by user
+        if user_arguments.log_format:
+            log_format = user_arguments.log_format
+        else:
+            log_format = self.get_httpd_logformat_directive(user_arguments.httpd_conf_file, user_arguments.httpd_log_nickname)
-    if os.access(filepath, eval(flag)):
-        return True
+        parser = LogParser(log_format)
-    return False
+        if user_arguments.codes:
+            codes = self.get_input_status_codes(self.populate_status_codes(), user_arguments.codes)
+
+        if user_arguments.countries:
+            countries = user_arguments.countries
+
+        date_lower = user_arguments.date_lower
+        date_upper = user_arguments.date_upper
+        day_format = "%d-%m-%Y"
+
+        if date_lower is not None:
+            date_lower = datetime.strptime(date_lower, day_format)
+        if date_upper is not None:
+            date_upper = datetime.strptime(date_upper, day_format)
+
+        files = self.get_files(user_arguments.files_regex, user_arguments.files_list)
+
+        show_progress = user_arguments.show_progress
+        use_geolocation = user_arguments.use_geolocation
-# TODO Really exclude, when no additional args are passed to either of both
-if args.sortby_field is not None and args.sortby_field_reverse is not None:
-    raise Exception("Use either normal or reverse sorting.")
+        geotool_exec = user_arguments.geotool_exec
+        geo_database_location = user_arguments.geo_database_location
-sortby_field = None
-if args.sortby_field is not None:
-    sortby_field = args.sortby_field
-    reverse_order = False
-elif args.sortby_field_reverse is not None:
-    sortby_field = args.sortby_field_reverse
-    reverse_order = True
+        incl_fields = user_arguments.incl_fields
+        if isinstance(user_arguments.incl_fields, str):
+            incl_fields = user_arguments.incl_fields.replace(' ','').split(',')
-i = 0
-country_seen = False
-prev_host = ""
-host_country = ""
-host_city = ""
-log_entries = []
+        fields = self.get_included_fields(
+            self.get_out_fields(),
+            incl_fields,
+            user_arguments.excl_fields
+        )
-for file in files:
-    if not no_progress:
-        print("Processing file: " + file)
+        if use_geolocation:
+            fields['country']['included'] = True
+            fields['city']['included'] = True
-    with open(log_dir + file, 'r') as f:
+        if fields['country']['included'] or fields['city']['included']:
+            use_geolocation = True
+
+        invalid_lines = []
+        field_names = []
+        i = 0
+        country_seen = False
+        geo_data = None
+        skip_line_by_status = False
+        skip_line_by_country = False
+
+        lines_total = sum([i['lines'] for i in self.get_file_line_count(files)])
+
+        if show_progress:
+            print(
+                "File count: {}\nLines in total: {}".format(
+                    str(len(files)),
+                    str(lines_total)
+                ))
+
+        for lfile in files:
+
+            if show_progress:
+                print("Processing file: {} (lines: {})".format(
+                    lfile,
+                    str(self.get_file_line_count([lfile])[0]['lines'])
+                ))
+
+            with open(lfile, 'r') as f:
+                for line in f:
-        for line in f:
-            if not no_progress:
-                print("Processing log entry: " + str(i), end = "\r")
-
-            if i != 0 and not (skip_line_1 or skip_line_2):
-                prev_host = entry_remote_host
-                prev_host_time = entry_time
-
-            entry = parser.parse(line)
-            entry_time 
= entry.request_time.replace(tzinfo=None) - - # TODO Handle situations where date_upper & date_lower are equal - if day_upper is not None and day_lower is not None: - if day_lower > day_upper: - raise Exception("Earlier day can't be later than later day") - - if day_upper is not None: - if day_upper > datetime.now(): - raise Exception("Day can't be in the future") - - if day_lower is not None: - if day_lower > datetime.now(): - raise Exception("Day can't be in the future") - - if day_lower is not None: - if entry_time <= day_lower: continue - - if day_upper is not None: - if entry_time >= day_upper: continue - - entry_remote_host = entry.remote_host - entry_http_status = entry.final_status - entry_user_agent = entry.headers_in["User-Agent"] - - # In case where request has newline or other similar chars. Tell Python interpreter to escape them - entry_http_request = str(entry.request_line).encode('unicode_escape').decode() - - if status_filter: - for status in code_statuses: - num, num_ok = status - status = int(num) - if status != entry_http_status: - skip_line_1 = True - else: - skip_line_1 = False - break - - if not args.no_geo and fileCheck(geotool, "os.X_OK", "PATH") and fileCheck(geodb, "os.R_OK"): - if prev_host == entry.remote_host: - country_seen = True - else: - country_seen = False - - if not country_seen: - host_country_main = subprocess.check_output([geotool,'-d',geodb,entry_remote_host]).rstrip().decode() - host_country_main = host_country_main.split('\n') - try: - host_country = host_country_main[0].split(', ')[1] - except: - if re.search("Address not found", host_country): - host_country = "Unknown" - - else: - if len(host_country_main) > 1: - try: - host_city = host_country_main[1].split(', ')[4] - except: - if re.search("N/A", host_city): - host_city = "Unknown: " + host_country_main[1].split(', ')[6] + ', ' + host_country_main[1].split(', ')[7] - - if country_filter: - for country in countries_filter_list: - if country[1] == "!": - country = country[2:] - if country.lower() == host_country.lower(): - skip_line_2 = True - break - else: - skip_line_2 = False - - elif country.lower() != host_country.lower(): - skip_line_2 = True - else: - skip_line_2 = False - break + if show_progress: + print("Processing log entry: {} ({}%)".format( + str(i), + round(100 * (i/lines_total), 2) + ), end = "\r") + + if i != 0 and not (skip_line_by_status or skip_line_by_country) and entry_data: + prev_host = entry_data['remote_host'] + prev_host_time = entry_data['time'] + + try: + entry = parser.parse(line) + except InvalidEntryError: + invalid_lines.append((lfile, i + 1)) + continue + + entry_data = { + 'time': entry.request_time.replace(tzinfo = None), + 'user_agent': entry.headers_in["User-Agent"], + 'http_request': str(entry.request_line).encode('unicode_escape').decode(), + 'remote_host': entry.remote_host, + 'status': entry.final_status + } + + if not self.date_checker(date_lower, date_upper, entry_data['time']): + i += 1 + continue + + if len(codes) > 0: + skip_line_by_status = self.filter_status_code(codes, entry_data['status']) + if use_geolocation: + if prev_host == entry_data['remote_host']: + country_seen = True else: - skip_line_2 = False - - if skip_line_1 or skip_line_2: - i += 1 - continue - - time_diff = str("NEW_CONN") - if prev_host == entry_remote_host: - time_diff = ( entry_time - prev_host_time ).total_seconds() - if time_diff > 0: - time_diff = "+" + str(time_diff) - if i == 0: - time_diff = float(0.0) - - # TODO: Optimize stri generation logic, avoid generating multiple 
times since it's really not necessary - out_fields = [ - ('log_file_name', file, '{:s}' ), - ('http_status', entry_http_status, '{:3s}' ), - ('remote_host', entry_remote_host, '{:15s}'), - ('country', host_country, '{:20s}'), - ('city', host_city, '{:15s}'), - ('time', entry_time, '{:8s}' ), - ('time_diff', time_diff, '{:8s}' ), - ('user_agent', entry_user_agent, '{:s}' ), - ('http_request', entry_http_request, '{:s}' ) - ] - - stri = "" - printargs = [] - t = 0 - while t <= len(out_fields_list) - 1: - - for out_field in out_fields: - entry, data, striformat = out_field - - if args.no_geo and (entry == "country" or entry == "city"): - continue - - if out_fields_list[t] == entry: - stri += "\t" + striformat - printargs.append(data) - break - t += 1 - - log_entries.append(printargs) + country_seen = False + if not country_seen: + geo_data = self.geotool_get_data(geotool_exec, geo_database_location, entry_data['remote_host']) + + if len(countries) > 0 and geo_data is not None: + skip_line_by_country = self.filter_country(countries, geo_data['host_country']) + + else: + skip_line_by_country = False + + if skip_line_by_status or skip_line_by_country: i += 1 + continue + + time_diff = str('NEW_CONN') + if prev_host == entry_data['remote_host']: + time_diff = (entry_data['time'] - prev_host_time).total_seconds() + if isinstance(time_diff, float): + time_diff = int(time_diff) + if time_diff > 0: + time_diff = "+" + str(time_diff) + if i == 0: + time_diff = int(0) + + if fields['log_file_name']['included']: + fields['log_file_name']['data'] = lfile + if fields['http_status']['included']: + fields['http_status']['data'] = entry_data['status'] + if fields['remote_host']['included']: + fields['remote_host']['data'] = entry_data['remote_host'] + + if geo_data is not None: + if fields['country']['included']: + fields['country']['data'] = geo_data['host_country'] + if fields['city']['included']: + fields['city']['data'] = geo_data['host_city'] + + if fields['time']['included']: + fields['time']['data'] = entry_data['time'] + if fields['time_diff']['included']: + fields['time_diff']['data'] = time_diff + if fields['user_agent']['included']: + fields['user_agent']['data'] = entry_data['user_agent'] + if fields['http_request']['included']: + fields['http_request']['data'] = entry_data['http_request'] + + stri = "" + printargs = [] + + for key, value in fields.items(): + if not use_geolocation and (key == 'country' or key == 'city'): + continue + if value['included']: + stri += "\t" + value['format'] + printargs.append(value['data']) + + if not any(key in i for i in field_names): + field_names.append((key, value['human_name'])) + + log_entries.append(printargs) + i += 1 + + return [log_entries, files, i, stri, field_names, invalid_lines] + + """ + Execute + """ + def execute(self): + + user_arguments = self.get_args() + + print_headers = user_arguments.column_headers + show_progress = user_arguments.show_progress + show_stats = user_arguments.show_stats + output_format = user_arguments.output_format + + sortby_field = user_arguments.sortby_field + reverse_order = bool(user_arguments.sortby_reverse) + + if 'all' not in user_arguments.incl_fields: + if sortby_field and sortby_field not in user_arguments.incl_fields: + raise Exception("Sort-by field must be included in output fields.") + + results = self.process_files(user_arguments) + result_entries = results[0] + result_files = results[1] + result_lines = results[2] + stri = results[3] + out_fields = [i[0] for i in results[4]] + out_fields_human_names = [i[1] 
for i in results[4]] + invalid_lines = results[5] + + if sortby_field is not None: + out_field_validation = self.get_out_field(out_fields, sortby_field) + if out_field_validation[0]: + result_entries.sort( + key = lambda result_entries : result_entries[out_field_validation[1]] or '', + reverse = reverse_order + ) -if sortby_field is not None: - sort_field_found = False - d = 0 - for field in out_fields_list: - if field == sortby_field: - sort_field_index = d - sort_field_found = True - break - d += 1 + if not show_progress: + print("\n") + + if output_format == 'table': + + if print_headers: + print("\n") + print(stri.format(*out_fields_human_names).lstrip()) + + for entry in result_entries: + c = 0 + entry_items = [] + while c < len(entry): + entry_items.append(str(entry[c])) + c += 1 + print(stri.format(*entry_items).lstrip()) - if sort_field_found: - log_entries.sort(key = lambda log_entries: log_entries[sort_field_index], reverse=reverse_order) + if output_format == 'csv': -if not no_progress: - print("\n") + if print_headers: + print("\n") + print(','.join(out_fields_human_names)) -for entry in log_entries: - c = 0 - entry_tmp = [] - while c <= len(entry) - 1: - entry_tmp.append(str(entry[c])) - c += 1 - print(stri.format(*entry_tmp).lstrip()) + for entry in result_entries: + c = 0 + entry_items = [] + while c < len(entry): + entry_items.append(str(entry[c])) + c += 1 + print(','.join(entry_items)) -if args.show_count: - print(("\n" + + if show_stats: + print(("\n" + "Processed files: {:s}\n" + "Processed log entries: {:d}\n" + - "Matched log entries: {:d}\n").format( - ', '.join(files), - i, - len(log_entries) - ) + "Matched log entries: {:d}\n" + ).format( + ', '.join(result_files), + result_lines, + len(result_entries) ) + ) + if len(invalid_lines) > 0: + print("Invalid lines:") + for i in invalid_lines: + print("\tFile: {:s}, line: {:d}".format(i[0], i[1])) + print("\n") + +if __name__ == "__main__": + app = program() + app.execute()
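
Typical invocations of the refactored parser might look like the following (log paths illustrative):

    # All 4xx hits outside Finland, with geo lookup, progress output and summary statistics
    python logparser.py -fr '/var/log/httpd/access_log.*' -c '4..' -cf '\!Finland' -gl -p -st

    # Status, remote host and time only, sorted by time, printed as CSV with headers
    python logparser.py -f /var/log/httpd/access_log -if http_status,remote_host,time -sb time -ph --output-format csv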