Simple Apache/HTTPD log parser for administrative analysis
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

796 lines
24 KiB

#!/bin/env python
# Simple Apache HTTPD log file parser
# Copyright (C) 2022 Pekka Helenius <pekka [dot] helenius [at] fjordtek [dot] com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
################################################################
# TODO prev_host: instead of comparing to previous entry, check if such IP has been seen in XXX seconds
# TODO: store IP values for temporary list for XXX seconds, and check list values
import argparse
import os
import re
import subprocess
from datetime import datetime
from apachelogs import LogParser, InvalidEntryError
class program(object):
"""
Init
"""
def __init__(self):
    """
    Prepare the patterns used to recognise non-routable client addresses.

    Command-line arguments are not parsed here: execute() calls get_args()
    itself, so parsing them in the constructor as well only repeated the
    work and threw the result away.
    """
    # Exclude private IP address classes from geo lookup process
    # 127.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
    #
    # The patterns are later joined with '|' and used with re.match().
    # The second octet of 172.16.0.0/12 therefore needs a real (non-capturing)
    # group: written with square brackets it was read as a character class
    # followed by top-level alternatives, so any address starting with
    # 20-29 was reported as local.
    # NOTE(review): 10.0.0.0/8 is not covered by this list - confirm whether
    # that is intentional.
    self.private_class_ip_networks = [
        r'^127\.',
        r'^172\.(?:1[6-9]|2[0-9]|3[0-1])\.',
        r'^192\.168\.',
    ]
"""
Define & get output fields
"""
def get_out_fields(self):
    """
    Build a fresh description of every output column.

    Returns a dict keyed by field name, in output order. Each value is a
    dict with the keys 'data' (always None here), 'format' (a str.format
    column spec), 'included' (whether the column is shown by default) and
    'human_name' (the column header text).
    """
    # (field name, column format, shown by default, header text)
    columns = (
        ('log_file_name', '{:s}', False, 'Log file name'),
        ('http_status', '{:3s}', True, 'Status'),
        ('remote_host', '{:15s}', True, 'Remote IP'),
        ('country', '{:20s}', False, 'Country'),
        ('city', '{:15s}', False, 'City'),
        ('time', '{:20s}', True, 'Date/Time'),
        ('time_diff', '{:8s}', True, 'Time diff'),
        ('user_agent', '{:s}', True, 'User agent'),
        ('http_request', '{:s}', True, 'Request'),
    )
    return {
        name: {'data': None, 'format': spec, 'included': shown, 'human_name': title}
        for name, spec, shown, title in columns
    }
"""
Argument parser
"""
def get_args(self, argv=None):
    """
    Define the command-line interface and parse the arguments.

    argv: optional list of argument strings. When None (the default)
          argparse reads sys.argv[1:], exactly as before.

    Returns the argparse.Namespace with the parsed options.
    """
    all_fields = self.get_out_fields()
    incl_fields = [i for i in all_fields.keys() if all_fields[i]['included']]
    out_time_format = "%d-%m-%Y %H:%M:%S"

    def comma_list(value):
        # argparse also runs string *defaults* through 'type', so the
        # default "a, b, c" used to become ['a', ' b', ' c'] and the padded
        # names were later rejected as unknown fields. Strip every item and
        # drop empty ones (e.g. from a trailing comma).
        return [item.strip() for item in value.split(',') if item.strip()]

    argparser = argparse.ArgumentParser(
        description = 'Apache HTTPD server log parser',
        formatter_class = argparse.ArgumentDefaultsHelpFormatter
    )
    argparser.add_argument(
        '-fr', '--files-regex',
        help = 'Apache log files matching input regular expression.',
        nargs = '?',
        dest = 'files_regex',
        required = False
    )
    argparser.add_argument(
        '-f', '--files-list',
        help = 'Apache log files.\nRegular expressions supported.',
        nargs = '?',
        type = comma_list,
        dest = 'files_list',
        required = False
    )
    argparser.add_argument(
        '-c', '--status-codes',
        help = 'Print only these numerical status codes.\nRegular expressions supported.',
        nargs = '+',
        dest = 'codes'
    )
    argparser.add_argument(
        '-cf', '--countries',
        # Escaped backslash: same help text as before, without relying on
        # the invalid '\!' escape sequence.
        help = 'Include only these countries.\nNegative match (exclude): "\\!Country"',
        nargs = '?',
        type = comma_list,
        dest = 'countries'
    )
    argparser.add_argument(
        '-tf', '--time-format',
        help = 'Output time format.',
        nargs = '?',
        dest = 'time_format',
        default = out_time_format
    )
    argparser.add_argument(
        '-if', '--included-fields',
        help = 'Included fields.\nAll fields: all, ' + ', '.join(all_fields),
        nargs = '?',
        dest = 'incl_fields',
        type = comma_list,
        default = ', '.join(incl_fields)
    )
    argparser.add_argument(
        '-ef', '--excluded-fields',
        help = 'Excluded fields.',
        nargs = '?',
        dest = 'excl_fields',
        type = comma_list,
        default = None
    )
    argparser.add_argument(
        '-gl', '--geo-location',
        help = 'Check origin countries with external "geoiplookup" tool.\nNOTE: Automatically includes "country" and "city" fields.',
        action = 'store_true',
        dest = 'use_geolocation'
    )
    argparser.add_argument(
        '-ge', '--geotool-exec',
        help = '"geoiplookup" tool executable found in PATH.',
        nargs = '?',
        dest = 'geotool_exec',
        default = "geoiplookup"
    )
    argparser.add_argument(
        '-gd', '--geo-database-dir',
        help = 'Database file directory for "geoiplookup" tool.',
        nargs = '?',
        dest = 'geo_database_location',
        default = '/usr/share/GeoIP/'
    )
    argparser.add_argument(
        '-dl', '--day-lower',
        help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020',
        nargs = '?',
        dest = 'date_lower'
    )
    argparser.add_argument(
        '-du', '--day-upper',
        help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020',
        nargs = '?',
        dest = 'date_upper'
    )
    argparser.add_argument(
        '-sb', '--sort-by',
        help = 'Sort by an output field.',
        nargs = '?',
        dest = 'sortby_field'
    )
    argparser.add_argument(
        '-ro', '--reverse-order',
        help = 'Sort in reverse order.',
        dest = 'sortby_reverse',
        action = 'store_true'
    )
    argparser.add_argument(
        '-st', '--show-stats',
        help = 'Show short statistics at the end.',
        action = 'store_true',
        dest = 'show_stats'
    )
    argparser.add_argument(
        '-p', '--show-progress',
        help = 'Show progress information.',
        dest = 'show_progress',
        action = 'store_true'
    )
    # The next two options carry a value. They were declared with
    # action='store_true', which made it impossible to pass a path or a
    # nickname: giving the flag simply replaced the default with True.
    argparser.add_argument(
        '--httpd-conf-file',
        help = 'Apache HTTPD configuration file with LogFormat directive.',
        dest = 'httpd_conf_file',
        default = '/etc/httpd/conf/httpd.conf'
    )
    argparser.add_argument(
        '--httpd-log-nickname',
        help = 'LogFormat directive nickname',
        dest = 'httpd_log_nickname',
        default = 'combinedio'
    )
    argparser.add_argument(
        '-lf', '--log-format',
        help = 'Log format, manually defined.',
        dest = 'log_format',
        required = False
    )
    argparser.add_argument(
        '-ph', '--print-headers',
        help = 'Print column headers.',
        dest = 'column_headers',
        required = False,
        action = 'store_true'
    )
    argparser.add_argument(
        '--output-format',
        help = 'Output format for results.',
        dest = 'output_format',
        required = False,
        default = 'table',
        choices = ['table', 'csv']
    )
    args = argparser.parse_args(argv)
    return args
"""
Populate recognized HTTP status codes
"""
def populate_status_codes(self):
    """
    Expand the table of recognized HTTP status codes.

    Returns a list of status codes as strings. Entries written as
    'first-last' are expanded to every code in that range, both ends
    included.
    """
    http_valid_codes = [
        '100-103',
        '200-208',
        # A missing comma here used to fuse the next two entries into the
        # single bogus code '218226'.
        '218',
        '226',
        '300-308',
        '400-431',
        '451',
        '500-511'
    ]
    codes = []
    for code in http_valid_codes:
        first, dash, last = code.partition('-')
        if dash:
            # range() stops before its end value, so add one to keep the
            # last code of the range (103, 208, 308, 431, 511).
            codes.extend(str(i) for i in range(int(first), int(last) + 1))
        else:
            codes.append(code)
    return codes
"""
Get valid HTTP status codes from user input
"""
def get_input_status_codes(self, valid_codes, user_codes):
    """
    Resolve user-supplied status code patterns against the known codes.

    Every user value is converted to a string and used as a regular
    expression (re.search) against each entry of valid_codes.

    Returns a list of (code, validated) tuples: one (valid_code, True) per
    matching known code, or a single (user_code, False) when the pattern
    matched nothing.
    """
    resolved = []
    for raw_code in user_codes:
        pattern = str(raw_code)
        matches = [
            (known, True)
            for known in valid_codes
            if re.search(pattern, known)
        ]
        if matches:
            resolved.extend(matches)
        else:
            resolved.append((pattern, False))
    return resolved
"""
Get log file list
"""
def get_files(self, files_regex=None, files_list=None):
    """
    Collect the log files to process.

    files_regex: path whose last component is a regular expression; every
                 regular file in the directory part whose name matches it
                 (re.match) is selected.
    files_list:  explicit list of file paths; entries that are not existing
                 regular files are ignored.

    Exactly one of the two selection methods must be given.

    Returns the sorted list of selected file paths.
    Raises Exception when no method or both methods are given, or when no
    file matches.
    """
    if files_regex is None and files_list is None:
        raise Exception("Either single file or regex file selection method is required.")
    if files_regex and files_list:
        raise Exception("Single file and regex file selection methods are mutually exclusive.")

    files = []
    if files_regex:
        # os.path.split keeps '/' for a pattern located in the root
        # directory and yields '' for a bare pattern. The hand-made split
        # produced '' in both cases and os.listdir('') then failed.
        log_dir, file_part = os.path.split(files_regex)
        for lfile in os.listdir(log_dir or os.curdir):
            candidate = os.path.join(log_dir, lfile)
            if os.path.isfile(candidate) and re.match(file_part, lfile):
                files.append(candidate)
    if files_list:
        files.extend(lfile for lfile in files_list if os.path.isfile(lfile))

    if not files:
        raise Exception("No matching files found.")
    files.sort()
    return files
"""
Common file checker
"""
def check_file(self, sfile, flag, env = None):
    """
    Check whether a file is accessible in the requested mode.

    sfile: file path, or a bare file name when env is given.
    flag:  access mode, either an os.*_OK integer or its name as a string
           ("os.X_OK", "R_OK", "os.R_OK | os.W_OK").
    env:   optional name of an environment variable holding an
           os.pathsep-separated directory list (such as PATH); the first
           directory containing sfile as a regular file is used.

    Returns True when os.access() grants the requested mode, else False.
    Raises ValueError for an unknown flag name.
    """
    if isinstance(flag, int):
        mode = flag
    else:
        # The flag name used to be passed to eval(). Resolve it against a
        # fixed set of os constants so no arbitrary expression can run.
        mode = 0
        for part in str(flag).split('|'):
            flag_name = part.strip().rpartition('.')[2]
            if flag_name not in ('F_OK', 'R_OK', 'W_OK', 'X_OK'):
                raise ValueError("Unsupported access flag: {}".format(flag))
            mode |= getattr(os, flag_name)

    file_path = sfile
    if env is not None:
        # An unset variable is treated like an empty search path.
        for path in os.environ.get(env, '').split(os.pathsep):
            file_path = os.path.join(path, sfile)
            if os.path.isfile(file_path):
                break
    return os.access(file_path, mode)
"""
Get Apache HTTPD LogFormat directive syntax
"""
def get_httpd_logformat_directive(self, cfile, tag=None):
    """
    Read a LogFormat directive from an Apache HTTPD configuration file.

    cfile: path of the configuration file.
    tag:   LogFormat nickname to look for (for example 'combinedio'). With
           None the first LogFormat directive in the file is used.

    Returns the format string of the first matching directive with all
    backslashes removed, or None when no directive matches.
    Raises Exception when the file cannot be read.
    """
    if tag is None:
        # Previously None was concatenated into the pattern, and the
        # resulting TypeError was reported as an unreadable file.
        pattern = r'^\s*LogFormat\s+"(.*)"'
    else:
        # Anchor the nickname at the end of the line so that 'combined'
        # does not pick up a 'combinedio' directive, and accept any (or no)
        # leading whitespace instead of requiring at least one space.
        pattern = r'^\s*LogFormat\s+"(.*)"\s+' + re.escape(tag) + r'\s*$'
    directive = re.compile(pattern)

    try:
        with open(cfile, 'r') as f:
            for line in f:
                found = directive.search(line)
                if found:
                    # Drop the backslashes that escape quotes inside the
                    # directive.
                    return found.group(1).replace('\\', '')
    except OSError as err:
        raise Exception("Couldn't open Apache HTTPD configuration file.") from err
    return None
"""
Geotool processing
"""
def geotool_get_data(self, geotool_exec, database_file, remote_host):
    """
    Look up country and city of a remote host with the external geo tool.

    geotool_exec:  name of the lookup executable, searched in PATH.
    database_file: database location handed to the tool with '-d'.
    remote_host:   address to look up.

    Returns a dict with the keys 'host_country' and 'host_city'. Addresses
    matching self.private_class_ip_networks yield "Local" for both without
    running the tool. Either value may be None when the tool output could
    not be interpreted. Returns None when the executable or the database
    location is not accessible.
    """
    if re.match('|'.join(self.private_class_ip_networks), remote_host):
        return {
            'host_country': "Local",
            'host_city': "Local"
        }

    tool_usable = (
        self.check_file(geotool_exec, "os.X_OK", "PATH")
        and self.check_file(database_file, "os.R_OK")
    )
    if not tool_usable:
        return None

    # Undecodable bytes are replaced rather than aborting the whole run.
    # NOTE(review): the tool's output encoding is not known from this file.
    raw_output = subprocess.check_output([geotool_exec, '-d', database_file, remote_host])
    report = raw_output.rstrip().decode(errors='replace').split('\n')

    host_country = None
    host_city = None

    # The country name is taken from the second ', '-separated item of the
    # first output line.
    country_parts = report[0].split(', ')
    if len(country_parts) > 1:
        host_country = country_parts[1]
    elif re.search("Address not found", report[0]):
        host_country = "Unknown"

    if len(report) > 1:
        city_parts = report[1].split(', ')
        # Only a too-short line is expected here; anything else (including
        # Ctrl-C) must not be swallowed as the former bare 'except' did.
        try:
            host_city = city_parts[4]
            if re.search("N/A", host_city):
                host_city = "Unknown: " + city_parts[6] + ', ' + city_parts[7]
        except IndexError:
            pass

    return {
        'host_country': host_country,
        'host_city': host_city
    }
"""
Status code filter
"""
def filter_status_code(self, status_codes, final_status):
    """
    Decide whether a log entry is dropped because of its status code.

    status_codes: sequence of (code, validated) pairs; items that are not
                  pairs and pairs flagged as not validated are ignored.
    final_status: numeric status of the log entry.

    Returns False (keep the entry) when a validated code equals
    final_status, otherwise True (skip the entry).
    """
    for candidate in status_codes:
        # Status consists of numerical status value and validity boolean value
        if len(candidate) != 2:
            continue
        code, code_is_valid = candidate
        if code_is_valid and int(code) == final_status:
            return False
    return True
"""
Country name filter
"""
def filter_country(self, countries, host_country):
    """
    Decide whether a log entry is dropped because of its origin country.

    countries:    country names to include. A name prefixed with '\\!' or
                  '!' is a negative match (exclude that country).
    host_country: country of the log entry; None is treated as an unknown
                  country that matches no name.

    Names are compared case-insensitively. Returns True when the entry
    must be skipped, False when it is kept.
    """
    host = (host_country or '').lower()
    skip_line = True
    for country in countries:
        name = country.strip()
        negated = False
        # Accept both the documented '\!Country' form and a plain
        # '!Country'. The old test on country[1] only knew the former and
        # raised IndexError on one-character or empty names.
        for prefix in ('\\!', '!'):
            if name.startswith(prefix):
                name = name[len(prefix):].strip()
                negated = True
                break
        if not name:
            continue
        matches = name.lower() == host
        if negated:
            if matches:
                # Explicitly excluded country: drop the entry.
                return True
            # Not the excluded country: keep unless a later rule says no.
            skip_line = False
        elif matches:
            return False
    return skip_line
"""
Get total number of lines in files
"""
def get_file_line_count(self, sfiles):
    """
    Count the lines of each given file.

    sfiles: iterable of file paths.

    Returns a list with one {'file': <path as str>, 'lines': <int>} dict
    per input file, in input order.
    Raises Exception when a file cannot be read.
    """
    lines_in_files = []
    for sfile in sfiles:
        try:
            with open(sfile, 'r') as f:
                # Count while streaming; building a list of every line
                # held the whole log file in memory just to take its len().
                line_count = sum(1 for _ in f)
        except (OSError, UnicodeError) as err:
            raise Exception("Couldn't read input file " + str(sfile)) from err
        lines_in_files.append({
            'file': str(sfile),
            'lines': int(line_count)
        })
    return lines_in_files
"""
Date checker
"""
def date_checker(self, date_lower, date_upper, entry_time):
    """
    Check whether a log entry falls inside the requested day range.

    date_lower: first day to include (datetime at the start of the day), or
                None for no lower limit.
    date_upper: last day to include (datetime at the start of the day), or
                None for no upper limit.
    entry_time: timestamp of the log entry (naive datetime).

    Both limit days are part of the range, so equal limits select exactly
    one day.

    Returns True when the entry is inside the range, else False.
    Raises Exception when the limits are reversed or lie in the future.
    """
    # Local import: the module itself only imports 'datetime'.
    from datetime import timedelta

    now = datetime.now()
    if date_upper is not None and date_lower is not None:
        if date_lower > date_upper:
            raise Exception("Earlier day can't be later than later day")
    if date_upper is not None and date_upper > now:
        raise Exception("Day can't be in the future")
    if date_lower is not None and date_lower > now:
        raise Exception("Day can't be in the future")

    # An entry stamped exactly at midnight of the lower day is not "older
    # than this day", hence '<' rather than '<='.
    if date_lower is not None and entry_time < date_lower:
        return False
    # The upper limit marks the start of its day. Entries are "newer than
    # this day" only from the following midnight on; comparing against the
    # limit itself dropped the entire upper day.
    if date_upper is not None and entry_time >= date_upper + timedelta(days=1):
        return False
    return True
"""
Get output field definitions (sortby)
"""
def get_out_field(self, fields, field_input):
    """
    Locate an output field by name.

    fields:      iterable of field names.
    field_input: name to look for.

    Returns [True, position] for the first field equal to field_input, or
    [False, number of fields examined] when there is none.
    """
    examined = 0
    for examined, name in enumerate(fields, start=1):
        if name == field_input:
            # enumerate counts from one here; report the zero-based slot
            return [True, examined - 1]
    return [False, examined]
"""
Get included fields
"""
def get_included_fields(self, fields, included_fields, excluded_fields=None):
    """
    Mark which output fields are shown.

    fields:          field table as built by get_out_fields(); the
                     'included' flag of every entry is updated in place.
    included_fields: field names to show; None or a list containing 'all'
                     selects every field. A comma-separated string is also
                     accepted.
    excluded_fields: optional field names to hide again.

    Returns the same fields dict.
    Raises Exception for an unknown field name, or when no field remains.
    """
    def as_names(value):
        # Tolerate a raw comma-separated string as well as a list.
        if isinstance(value, str):
            return [name.strip() for name in value.split(',') if name.strip()]
        return list(value)

    # Test for None first: "'all' in None" raised TypeError before the
    # None check was ever reached.
    if included_fields is None:
        included_fields = list(fields)
    else:
        included_fields = as_names(included_fields)
        if 'all' in included_fields:
            included_fields = list(fields)

    all_defined_fields = included_fields
    if excluded_fields is not None:
        excluded_fields = as_names(excluded_fields)
        if 'all' in excluded_fields:
            raise Exception("No output fields defined.")
        all_defined_fields = included_fields + excluded_fields
        included_fields = [i for i in included_fields if i not in excluded_fields]

    # Validate every name before touching the table.
    for i in all_defined_fields:
        if i not in fields:
            raise Exception("Unknown field value: {}. Accepted values: {}".format(i, ', '.join(fields.keys())))

    for key, value in fields.items():
        value['included'] = key in included_fields

    if not any(value['included'] for value in fields.values()):
        raise Exception("No output fields defined.")
    return fields
"""
Process input files
"""
def process_files(self, user_arguments):
    """
    Parse the selected log files and collect the matching entries.

    user_arguments: parsed command-line options as returned by get_args().

    Returns a list of six items:
      [0] matched entries, each a list of values for the included fields
      [1] list of processed files
      [2] number of parsed log entries (unparsable lines are not counted)
      [3] tab-separated format string for one output row
      [4] list of (field name, header text) tuples for the included fields
      [5] list of (file, line number) tuples for lines that failed to parse
    """
    prev_host = ""
    prev_host_time = None
    entry_data = None
    log_entries = []
    invalid_lines = []
    codes = []
    countries = []

    # Log format as defined in Apache/HTTPD configuration file (LogFormat directive) or manually by user
    if user_arguments.log_format:
        log_format = user_arguments.log_format
    else:
        log_format = self.get_httpd_logformat_directive(
            user_arguments.httpd_conf_file,
            user_arguments.httpd_log_nickname
        )
    parser = LogParser(log_format)

    if user_arguments.codes:
        codes = self.get_input_status_codes(self.populate_status_codes(), user_arguments.codes)
    if user_arguments.countries:
        countries = user_arguments.countries

    day_format = "%d-%m-%Y"
    date_lower = user_arguments.date_lower
    date_upper = user_arguments.date_upper
    if date_lower is not None:
        date_lower = datetime.strptime(date_lower, day_format)
    if date_upper is not None:
        date_upper = datetime.strptime(date_upper, day_format)

    files = self.get_files(user_arguments.files_regex, user_arguments.files_list)

    show_progress = user_arguments.show_progress
    use_geolocation = user_arguments.use_geolocation
    geotool_exec = user_arguments.geotool_exec
    geo_database_location = user_arguments.geo_database_location

    incl_fields = user_arguments.incl_fields
    if isinstance(incl_fields, str):
        incl_fields = incl_fields.replace(' ', '').split(',')
    fields = self.get_included_fields(
        self.get_out_fields(),
        incl_fields,
        user_arguments.excl_fields
    )
    if use_geolocation:
        fields['country']['included'] = True
        fields['city']['included'] = True
    if fields['country']['included'] or fields['city']['included']:
        use_geolocation = True

    # The row layout depends only on the included fields, so build it once
    # up front. It used to be rebuilt for every matching line and was left
    # undefined (crashing at 'return') when no line matched at all.
    stri = ""
    field_names = []
    out_keys = []
    for key, value in fields.items():
        if not use_geolocation and (key == 'country' or key == 'city'):
            continue
        if value['included']:
            stri += "\t" + value['format']
            field_names.append((key, value['human_name']))
            out_keys.append(key)

    # Count the lines once and reuse the per-file numbers for progress
    # output instead of reading every file a second time.
    line_counts = self.get_file_line_count(files)
    lines_total = sum(item['lines'] for item in line_counts)
    if show_progress:
        print(
            "File count: {}\nLines in total: {}".format(
                str(len(files)),
                str(lines_total)
            ))

    i = 0
    # Geo lookups are remembered per host. Reusing "the previous result"
    # went wrong when a skipped line from another host had replaced it.
    geo_cache = {}
    skip_line_by_status = False
    skip_line_by_country = False

    for lfile, counted in zip(files, line_counts):
        if show_progress:
            print("Processing file: {} (lines: {})".format(
                lfile,
                str(counted['lines'])
            ))
        with open(lfile, 'r') as f:
            for line_number, line in enumerate(f, start=1):
                if show_progress:
                    print("Processing log entry: {} ({}%)".format(
                        str(i),
                        round(100 * (i/lines_total), 2)
                    ), end = "\r")

                # Remember the host and time of the previously parsed entry
                # unless that entry was filtered out.
                if i != 0 and not (skip_line_by_status or skip_line_by_country) and entry_data:
                    prev_host = entry_data['remote_host']
                    prev_host_time = entry_data['time']

                try:
                    entry = parser.parse(line)
                except InvalidEntryError:
                    # Report the real position inside this file; the entry
                    # counter spans all files and skips unparsable lines.
                    invalid_lines.append((lfile, line_number))
                    continue

                entry_data = {
                    'time': entry.request_time.replace(tzinfo = None),
                    'user_agent': entry.headers_in["User-Agent"],
                    'http_request': str(entry.request_line).encode('unicode_escape').decode(),
                    'remote_host': entry.remote_host,
                    'status': entry.final_status
                }

                if not self.date_checker(date_lower, date_upper, entry_data['time']):
                    i += 1
                    continue

                if len(codes) > 0:
                    skip_line_by_status = self.filter_status_code(codes, entry_data['status'])

                geo_data = None
                if use_geolocation:
                    host = entry_data['remote_host']
                    if host not in geo_cache:
                        geo_cache[host] = self.geotool_get_data(geotool_exec, geo_database_location, host)
                    geo_data = geo_cache[host]
                    if len(countries) > 0 and geo_data is not None:
                        skip_line_by_country = self.filter_country(countries, geo_data['host_country'])
                    else:
                        skip_line_by_country = False

                if skip_line_by_status or skip_line_by_country:
                    i += 1
                    continue

                time_diff = str('NEW_CONN')
                if prev_host == entry_data['remote_host']:
                    time_diff = (entry_data['time'] - prev_host_time).total_seconds()
                    if isinstance(time_diff, float):
                        time_diff = int(time_diff)
                        if time_diff > 0:
                            time_diff = "+" + str(time_diff)
                if i == 0:
                    time_diff = int(0)

                # Values for this row only. Country and city are None when
                # no geo data is available, rather than carrying over the
                # value of an earlier row.
                values = {
                    'log_file_name': lfile,
                    'http_status': entry_data['status'],
                    'remote_host': entry_data['remote_host'],
                    'country': geo_data['host_country'] if geo_data is not None else None,
                    'city': geo_data['host_city'] if geo_data is not None else None,
                    'time': entry_data['time'],
                    'time_diff': time_diff,
                    'user_agent': entry_data['user_agent'],
                    'http_request': entry_data['http_request']
                }
                log_entries.append([values.get(key) for key in out_keys])
                i += 1

    return [log_entries, files, i, stri, field_names, invalid_lines]
"""
Execute
"""
def execute(self):
    """
    Run the program: parse the arguments, process the log files and print
    the matched entries (as a table or as CSV), followed by optional
    statistics.

    Raises Exception when the sort-by field is not among the output fields.
    """
    # Local imports: only needed here, and the module header does not
    # import them.
    import csv
    import sys

    user_arguments = self.get_args()
    print_headers = user_arguments.column_headers
    show_progress = user_arguments.show_progress
    show_stats = user_arguments.show_stats
    output_format = user_arguments.output_format
    time_format = user_arguments.time_format
    sortby_field = user_arguments.sortby_field
    reverse_order = bool(user_arguments.sortby_reverse)

    # Work on clean field names whether the option arrived as a string or
    # as a list, and count the geo fields that --geo-location adds
    # automatically as requested, so they can be used for sorting.
    requested_fields = user_arguments.incl_fields
    if requested_fields is None:
        requested_fields = ['all']
    elif isinstance(requested_fields, str):
        requested_fields = requested_fields.split(',')
    requested_fields = [name.strip() for name in requested_fields]
    if user_arguments.use_geolocation:
        requested_fields += ['country', 'city']
    if 'all' not in requested_fields:
        if sortby_field and sortby_field not in requested_fields:
            raise Exception("Sort-by field must be included in output fields.")

    results = self.process_files(user_arguments)
    result_entries = results[0]
    result_files = results[1]
    result_lines = results[2]
    stri = results[3]
    out_fields = [i[0] for i in results[4]]
    out_fields_human_names = [i[1] for i in results[4]]
    invalid_lines = results[5]

    if sortby_field is not None:
        found, column = self.get_out_field(out_fields, sortby_field)
        if found:
            try:
                result_entries.sort(
                    key = lambda row: row[column] or '',
                    reverse = reverse_order
                )
            except TypeError:
                # The column mixes numbers and text (time_diff does), which
                # cannot be compared directly; order by text form instead.
                result_entries.sort(
                    key = lambda row: str(row[column] or ''),
                    reverse = reverse_order
                )

    def as_text(value):
        # Apply the --time-format option, which was accepted but never
        # used; timestamps stay datetime objects until now so that sorting
        # by time remains chronological.
        if isinstance(value, datetime) and time_format:
            return value.strftime(time_format)
        return str(value)

    if not show_progress:
        print("\n")

    if output_format == 'table':
        if print_headers:
            print("\n")
            print(stri.format(*out_fields_human_names).lstrip())
        for entry in result_entries:
            print(stri.format(*[as_text(item) for item in entry]).lstrip())

    if output_format == 'csv':
        # csv.writer quotes values containing commas or quotes (user agents
        # and requests often do); a plain ','.join() produced broken rows.
        writer = csv.writer(sys.stdout, lineterminator = "\n")
        if print_headers:
            print("\n")
            writer.writerow(out_fields_human_names)
        for entry in result_entries:
            writer.writerow([as_text(item) for item in entry])

    if show_stats:
        print(("\n" +
            "Processed files: {:s}\n" +
            "Processed log entries: {:d}\n" +
            "Matched log entries: {:d}\n"
            ).format(
                ', '.join(result_files),
                result_lines,
                len(result_entries)
            )
        )
        if len(invalid_lines) > 0:
            print("Invalid lines:")
            for i in invalid_lines:
                print("\tFile: {:s}, line: {:d}".format(i[0], i[1]))
            print("\n")
# Script entry point: build the application object and run it, but only
# when this file is executed directly rather than imported.
if __name__ == "__main__":
    program().execute()