#!/bin/env python

# Simple Apache log parser
# Copyright (C) 2020 Pekka Helenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

################################################################

# TODO prev_host: instead of comparing to previous entry, check if such IP has been seen in XXX seconds
# store IP values for temporary list for XXX seconds, and check list values

"""Simple Apache log parser.

Reads the log files matching the given patterns from a directory, optionally
filters the entries by HTTP status code, day range and (via the external
``geoiplookup`` tool) country, and prints the selected output fields as
tab-separated columns.
"""

import argparse
import os
import re
import subprocess
import sys
from datetime import datetime

out_fields_list = ['log_file_name', 'http_status', 'remote_host', 'country', 'city', 'time', 'time_diff', 'user_agent', 'http_request']
out_timeformat = "%d-%m-%Y %H:%M:%S"
dayformat = "%d-%m-%Y"
# '%' has to be doubled because argparse %-formats its help strings.
ot = '"' + re.sub(r'%', '%%', out_timeformat) + '"'

geotool = "geoiplookup"
geodb = "/usr/share/GeoIP/"

# Log format as defined in Apache/HTTPD configuration file (LogFormat directive)
in_log_syntax = "%h %u %t %T \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" \"%{cache-status}e\" %I %O"

# Status codes accepted by the -c/--statuscodes filter.
http_valid_codes = [
    '100', '101', '102', '103',
    '200', '201', '202', '203', '204', '205', '206', '207', '208', '226',
    '300', '301', '302', '303', '304', '305', '306', '307', '308',
    '400', '401', '402', '403', '404', '405', '406', '407', '408', '409',
    '410', '411', '412', '413', '414', '415', '416', '417', '418',
    '421', '422', '423', '424', '425', '426', '428', '429', '431', '451',
    '500', '501', '502', '503', '504', '505', '506', '507', '508', '510', '511',
    '218',
]

# Column format of every known output field.
field_formats = {
    'log_file_name': '{:s}',
    'http_status': '{:3s}',
    'remote_host': '{:15s}',
    'country': '{:20s}',
    'city': '{:15s}',
    'time': '{:8s}',
    'time_diff': '{:8s}',
    'user_agent': '{:s}',
    'http_request': '{:s}',
}

# Fields that are dropped from the output when --nogeo is given.
geo_fields = ('country', 'city')

# Names accepted by fileCheck() as access flags.
access_flags = {
    'F_OK': os.F_OK,
    'R_OK': os.R_OK,
    'W_OK': os.W_OK,
    'X_OK': os.X_OK,
}


def build_arg_parser():
    """Return the command line parser of the tool."""
    argparser = argparse.ArgumentParser()

    argparser.add_argument(
        '-d', '--dir',
        help = 'Apache log file directory.',
        nargs = '?', dest = 'log_dir', required = True)
    argparser.add_argument(
        '-f', '--files',
        help = 'Apache log files. Regular expressions supported.',
        nargs = '+', dest = 'log_file', required = True)
    argparser.add_argument(
        '-s', '--logsyntax',
        help = 'Apache log files syntax, defined as "LogFormat" directive in Apache configuration.',
        nargs = '?', dest = 'log_syntax')
    argparser.add_argument(
        '-c', '--statuscodes',
        help = 'Print only these status codes. Regular expressions supported.',
        nargs = '+', dest = 'status_code')
    argparser.add_argument(
        '-cf', '--countryfilter',
        help = 'Include only these countries. Negative match (exclude): "\\!Country"',
        nargs = '+', dest = 'country')
    # NOTE(review): the default shown here is only applied when the option is
    # passed explicitly; without it the time is printed via str(datetime).
    argparser.add_argument(
        '-ot', '--outtimeformat',
        help = 'Output time format.\nDefault: ' + ot,
        nargs = '?', dest = 'out_timeformat')
    argparser.add_argument(
        '-of', '--outfields',
        help = 'Output fields.\nDefault: ' + ', '.join(out_fields_list),
        nargs = '+', dest = 'out_field')
    argparser.add_argument(
        '-ng', '--nogeo',
        help = 'Skip country check with external "geoiplookup" tool.',
        action='store_true', dest = 'no_geo')
    argparser.add_argument(
        '-gd', '--geodir',
        help = 'Database file directory for "geoiplookup" tool.\nDefault: ' + geodb,
        nargs = '?', dest = 'geodb')
    argparser.add_argument(
        '-dl', '--daylower',
        help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020',
        nargs = '?', dest = 'day_lower')
    argparser.add_argument(
        '-du', '--dayupper',
        help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020',
        nargs = '?', dest = 'day_upper')
    argparser.add_argument(
        '-sb', '--sortby',
        help = 'Sort by an output field.',
        nargs = '?', dest = 'sortby_field')
    argparser.add_argument(
        '-sbr', '--sortbyreverse',
        help = 'Sort by an output field, reverse order.',
        nargs = '?', dest = 'sortby_field_reverse')
    argparser.add_argument(
        '-st', '--stats',
        help = 'Show short statistics at the end.',
        action='store_true', dest = 'show_count')
    argparser.add_argument(
        '-np', '--noprogress',
        help = 'Do not show progress information.',
        action='store_true', dest = 'no_progress')

    return argparser


def fileCheck(file, flag, env=None):
    """Return True if *file* is accessible with the access mode *flag*.

    file -- path to check, or a bare file name when *env* is given.
    flag -- an ``os.access`` mode, either as an integer or as a string such
            as ``"os.X_OK"`` / ``"R_OK"``; several names may be combined
            with ``|``.
    env  -- optional name of an environment variable holding an
            ``os.pathsep``-separated directory list (e.g. ``"PATH"``); the
            first directory that contains *file* as a regular file is
            checked. If none does, the last candidate is checked, which
            fails. An unset variable yields False.

    Raises ValueError for an unknown flag name.
    """
    if isinstance(flag, int):
        mode = flag
    else:
        # Resolve the names through a fixed table instead of eval().
        mode = 0
        for part in flag.split('|'):
            flag_name = part.strip().rsplit('.', 1)[-1]
            if flag_name not in access_flags:
                raise ValueError("Unsupported access flag '" + flag + "'")
            mode |= access_flags[flag_name]

    if env is None:
        filepath = file
    else:
        search_dirs = os.environ.get(env)
        if search_dirs is None:
            return False
        filepath = file
        for path in search_dirs.split(os.pathsep):
            filepath = os.path.join(path, file)
            if os.path.isfile(filepath):
                break

    return os.access(filepath, mode)


def resolve_status_codes(patterns):
    """Expand status code patterns to the matching known HTTP status codes.

    Each pattern is applied with ``re.search`` to every entry of
    ``http_valid_codes``. Returns the matching codes (strings) in pattern
    order. Raises Exception listing every pattern that matched no code.
    """
    resolved = []
    invalid = []
    for pattern in patterns:
        matches = [code for code in http_valid_codes if re.search(pattern, code)]
        if matches:
            resolved.extend(matches)
        else:
            invalid.append(pattern)

    if invalid:
        error_msg = "".join(
            "Invalid status code '" + pattern + "' supplied\n" for pattern in invalid
        )
        raise Exception("\n" + error_msg)
    return resolved


def find_log_files(log_dir, patterns):
    """Return the sorted names of regular files in *log_dir* matching any pattern.

    A file matching several patterns is listed once, so it is not processed
    more than once.
    """
    matched = set()
    for pattern in patterns:
        for name in os.listdir(log_dir):
            if os.path.isfile(os.path.join(log_dir, name)) and re.search(pattern, name):
                matched.add(name)
    return sorted(matched)


def parse_country_rule(rule):
    """Split a country filter value into ``(country_name, negated)``.

    A value is a negative match when its second character is ``!`` (the
    documented ``\\!Country`` form) or when it starts with ``!`` (the form
    that arrives if the shell already consumed the backslash).
    """
    if rule.startswith('!'):
        return rule[1:], True
    if len(rule) > 1 and rule[1] == '!':
        return rule[2:], True
    return rule, False


def country_excluded(rules, host_country):
    """Return True if an entry from *host_country* must be skipped.

    Rules are evaluated in order, case-insensitively. The first rule whose
    country equals *host_country* decides: a negative rule excludes the
    entry, a positive rule includes it. If no rule matches, the last rule
    decides: the entry is skipped if that rule is positive and kept if it
    is negative.
    """
    wanted = host_country.lower()
    skip = False
    for rule in rules:
        country, negated = parse_country_rule(rule)
        matches = country.lower() == wanted
        skip = matches if negated else not matches
        if matches:
            break
    return skip


def parse_geo_output(output):
    """Extract ``(country, city)`` from the output of the geo lookup tool.

    The country is the text after the last ``", "`` of the first line, or
    ``"Unknown"`` if that line reports "Address not found". The city is the
    fifth ``", "``-separated field of the second line; when it is "N/A" the
    seventh and eighth fields are reported instead, prefixed by
    ``"Unknown: "``. The city is an empty string when it is not available.
    """
    lines = output.split('\n')
    country = re.sub(r"^.*, (.*)", r'\1', lines[0])
    city = ""

    if re.search("Address not found", country):
        return "Unknown", city

    if len(lines) > 1:
        fields = lines[1].split(', ')
        if len(fields) > 4:
            city = fields[4]
            if re.search("N/A", city) and len(fields) > 7:
                city = "Unknown: " + fields[6] + ', ' + fields[7]
    return country, city


def lookup_geo(host, database_dir):
    """Run the geo lookup tool for *host* and return ``(country, city)``."""
    output = subprocess.check_output([geotool, '-d', database_dir, host])
    return parse_geo_output(output.rstrip().decode())


def select_output_fields(requested, no_geo):
    """Return the requested field names that are actually printed.

    Unknown names are ignored, and the geo fields are dropped when *no_geo*
    is set. Order and duplicates of *requested* are preserved.
    """
    return [
        name for name in requested
        if name in field_formats and not (no_geo and name in geo_fields)
    ]


def sort_entries(rows, fields, sort_field, reverse):
    """Return *rows* sorted by the column named *sort_field*.

    *fields* names the columns of every row. The rows are returned
    unchanged if *sort_field* is not one of them. Columns holding values
    of mixed types (e.g. ``time_diff``) are sorted by their string form.
    """
    if sort_field not in fields:
        return rows
    index = fields.index(sort_field)
    try:
        return sorted(rows, key=lambda row: row[index], reverse=reverse)
    except TypeError:
        return sorted(rows, key=lambda row: str(row[index]), reverse=reverse)


def format_cell(value, time_format):
    """Return the printable form of one output value.

    Datetime values are rendered with *time_format* when it is not None;
    everything else goes through ``str``.
    """
    if time_format is not None and isinstance(value, datetime):
        return value.strftime(time_format)
    return str(value)


def main(argv=None):
    """Parse the command line, process the log files and print the result.

    argv -- argument list; defaults to ``sys.argv[1:]``.

    Raises Exception on invalid status codes, conflicting sort options or
    invalid day bounds.
    """
    args = build_arg_parser().parse_args(argv)

    # Imported here so that --help and importing this module do not require
    # the third-party package.
    from apachelogs import LogParser

    status_filter = args.status_code is not None
    wanted_statuses = set()
    if status_filter:
        wanted_statuses = {int(code) for code in resolve_status_codes(args.status_code)}

    country_filter = args.country is not None
    countries_filter_list = args.country

    # TODO Really exclude, when no additional args are passed to either of both
    if args.sortby_field is not None and args.sortby_field_reverse is not None:
        raise Exception("Use either normal or reverse sorting.")

    sortby_field = None
    reverse_order = False
    if args.sortby_field is not None:
        sortby_field = args.sortby_field
    elif args.sortby_field_reverse is not None:
        sortby_field = args.sortby_field_reverse
        reverse_order = True

    day_lower = None
    if args.day_lower is not None:
        day_lower = datetime.strptime(args.day_lower, dayformat)
    day_upper = None
    if args.day_upper is not None:
        day_upper = datetime.strptime(args.day_upper, dayformat)

    # The bounds do not depend on the log contents: validate them once.
    # TODO Handle situations where date_upper & date_lower are equal
    if day_upper is not None and day_lower is not None and day_lower > day_upper:
        raise Exception("Earlier day can't be later than later day")
    now = datetime.now()
    if day_upper is not None and day_upper > now:
        raise Exception("Day can't be in the future")
    if day_lower is not None and day_lower > now:
        raise Exception("Day can't be in the future")

    log_syntax = in_log_syntax if args.log_syntax is None else args.log_syntax
    geodb_dir = geodb if args.geodb is None else args.geodb
    # Applied to the 'time' column only when given explicitly, so the
    # default output stays as it was.
    time_format = args.out_timeformat
    requested_fields = out_fields_list if args.out_field is None else args.out_field

    log_dir = args.log_dir
    no_progress = args.no_progress
    parser = LogParser(log_syntax)
    files = find_log_files(log_dir, args.log_file)

    fields = select_output_fields(requested_fields, args.no_geo)
    stri = "".join("\t" + field_formats[name] for name in fields)

    # The tool and its database do not change while the logs are read.
    geo_available = (
        not args.no_geo
        and fileCheck(geotool, "os.X_OK", "PATH")
        and fileCheck(geodb_dir, "os.R_OK")
    )

    i = 0
    skip_line_1 = status_filter
    skip_line_2 = country_filter
    prev_host = ""
    prev_host_time = None
    entry_remote_host = None
    entry_time = None
    host_country = ""
    host_city = ""
    log_entries = []

    for file in files:
        if not no_progress:
            print("Processing file: " + file)

        with open(os.path.join(log_dir, file), 'r') as f:
            for line in f:
                if not no_progress:
                    print("Processing log entry: " + str(i), end = "\r")

                # Remember the previous entry only if it was not filtered out.
                if i != 0 and not (skip_line_1 or skip_line_2):
                    prev_host = entry_remote_host
                    prev_host_time = entry_time

                entry = parser.parse(line)
                entry_time = entry.request_time.replace(tzinfo=None)

                # NOTE(review): both bounds are compared against midnight of
                # the given day, so entries of the upper day itself are
                # skipped - confirm whether that is intended.
                if day_lower is not None and entry_time <= day_lower:
                    continue
                if day_upper is not None and entry_time >= day_upper:
                    continue

                entry_remote_host = entry.remote_host
                entry_http_status = entry.final_status
                entry_user_agent = entry.headers_in["User-Agent"]

                # In case where request has newline or other similar chars. Tell Python interpreter to escape them
                entry_http_request = str(entry.request_line).encode('unicode_escape').decode()

                if status_filter:
                    skip_line_1 = entry_http_status not in wanted_statuses

                if geo_available:
                    # Consecutive entries of one host reuse the last lookup.
                    if prev_host != entry_remote_host:
                        host_country, host_city = lookup_geo(entry_remote_host, geodb_dir)
                    if country_filter:
                        skip_line_2 = country_excluded(countries_filter_list, host_country)
                else:
                    skip_line_2 = False

                if skip_line_1 or skip_line_2:
                    i += 1
                    continue

                time_diff = "NEW_CONN"
                if prev_host == entry_remote_host:
                    time_diff = (entry_time - prev_host_time).total_seconds()
                    if time_diff > 0:
                        time_diff = "+" + str(time_diff)
                if i == 0:
                    time_diff = 0.0

                values = {
                    'log_file_name': file,
                    'http_status': entry_http_status,
                    'remote_host': entry_remote_host,
                    'country': host_country,
                    'city': host_city,
                    'time': entry_time,
                    'time_diff': time_diff,
                    'user_agent': entry_user_agent,
                    'http_request': entry_http_request,
                }
                log_entries.append([values[name] for name in fields])
                i += 1

    if sortby_field is not None:
        log_entries = sort_entries(log_entries, fields, sortby_field, reverse_order)

    if not no_progress:
        print("\n")

    for row in log_entries:
        cells = [format_cell(value, time_format) for value in row]
        print(stri.format(*cells).lstrip())

    if args.show_count:
        print((
            "\n" +
            "Processed files:       {:s}\n" +
            "Processed log entries: {:d}\n" +
            "Matched log entries:   {:d}\n"
        ).format(
            ', '.join(files),
            i,
            len(log_entries)
        ))


if __name__ == "__main__":
    main()