|
|
#!/usr/bin/env python
-
- # Simple Apache log parser
- # Copyright (C) 2020 Pekka Helenius
- #
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <https://www.gnu.org/licenses/>.
-
- ################################################################
-
- # TODO prev_host: instead of comparing to previous entry, check if such IP has been seen in XXX seconds
- # store IP values for temporary list for XXX seconds, and check list values
-
- import argparse
- import os
- import re
- import subprocess
- import sys
- from datetime import datetime
- from apachelogs import LogParser
-
# Default output columns, in print order.
out_fields_list = [
    'log_file_name', 'http_status', 'remote_host', 'country', 'city',
    'time', 'time_diff', 'user_agent', 'http_request',
]

# Time formats: full timestamp for output, day-only for the -dl/-du bounds.
out_timeformat = "%d-%m-%Y %H:%M:%S"
dayformat = "%d-%m-%Y"

# Quoted, %%-escaped copy of the default time format for the --help text.
ot = '"{}"'.format(out_timeformat.replace('%', '%%'))

# External GeoIP lookup tool and its database directory.
geotool = "geoiplookup"
geodb = "/usr/share/GeoIP/"

# Log format as defined in Apache/HTTPD configuration file (LogFormat directive)
in_log_syntax = '%h %u %t %T "%r" %>s %b "%{Referer}i" "%{User-agent}i" "%{cache-status}e" %I %O'
-
# Command-line interface.
# NOTE(review): -ot is stored into out_timeformat below but the stored format
# is never applied when entry times are printed — verify intended behavior.
argparser = argparse.ArgumentParser()

argparser.add_argument('-d', '--dir', help = 'Apache log file directory.', nargs = '?', dest = 'log_dir', required = True)
argparser.add_argument('-f', '--files', help = 'Apache log files. Regular expressions supported.', nargs = '+', dest = 'log_file', required = True)
argparser.add_argument('-s', '--logsyntax', help = 'Apache log files syntax, defined as "LogFormat" directive in Apache configuration.', nargs = '?', dest = 'log_syntax')
argparser.add_argument('-c', '--statuscodes', help = 'Print only these status codes. Regular expressions supported.', nargs = '+', dest = 'status_code')
# Raw string: '\!' is an invalid escape sequence in a normal string literal
# (SyntaxWarning on modern Python); the help text itself is unchanged.
argparser.add_argument('-cf', '--countryfilter', help = r'Include only these countries. Negative match (exclude): "\!Country"', nargs = '+', dest = 'country')
argparser.add_argument('-ot', '--outtimeformat', help = 'Output time format.\nDefault: ' + ot, nargs = '?', dest = 'out_timeformat')
argparser.add_argument('-of', '--outfields', help = 'Output fields.\nDefault: ' + ', '.join(out_fields_list), nargs = '+', dest = 'out_field')
argparser.add_argument('-ng', '--nogeo', help = 'Skip country check with external "geoiplookup" tool.', action='store_true', dest = 'no_geo')
argparser.add_argument('-gd', '--geodir', help = 'Database file directory for "geoiplookup" tool.\nDefault: ' + geodb, nargs = '?', dest = 'geodb')
argparser.add_argument('-dl', '--daylower', help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020', nargs = '?', dest = 'day_lower')
argparser.add_argument('-du', '--dayupper', help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020', nargs = '?', dest = 'day_upper')
argparser.add_argument('-sb', '--sortby', help = 'Sort by an output field.', nargs = '?', dest = 'sortby_field')
argparser.add_argument('-sbr', '--sortbyreverse', help = 'Sort by an output field, reverse order.', nargs = '?', dest = 'sortby_field_reverse')
argparser.add_argument('-st', '--stats', help = 'Show short statistics at the end.', action='store_true', dest = 'show_count')
argparser.add_argument('-np', '--noprogress', help = 'Do not show progress information.', action='store_true', dest = 'no_progress')
args = argparser.parse_args()
-
# Status-code filter switches. skip_line_1 starts True when a filter is
# active so unmatched entries are skipped until the filter clears the flag.
if args.status_code is None:
    status_filter = False
    skip_line_1 = False
    # Fix: define status_codes in both branches — the expansion loop below
    # iterates it unconditionally and crashed with NameError without -c.
    status_codes = []
else:
    status_filter = True
    skip_line_1 = True
    status_codes = args.status_code
-
# Known HTTP status codes, kept as strings so user-supplied regex patterns
# can be matched against them. 218 (unofficial "This is fine") comes last.
http_valid_codes = (
    "100 101 102 103 "
    "200 201 202 203 204 205 206 207 208 226 "
    "300 301 302 303 304 305 306 307 308 "
    "400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 "
    "415 416 417 418 421 422 423 424 425 426 428 429 431 451 "
    "500 501 502 503 504 505 506 507 508 510 511 "
    "218"
).split()
-
# Expand each user-supplied status pattern against the known HTTP codes.
# A pattern may be a regex ("4..") and can therefore expand to several codes.
code_statuses = []
if status_filter:  # fix: without this guard, status_codes was a NameError when -c absent
    for status_input in status_codes:
        matched = False
        for status_valid in http_valid_codes:
            if re.search(status_input, status_valid):
                code_statuses.append((status_valid, True))
                matched = True
        # Record an unmatched pattern exactly once, after scanning all codes
        # (the original appended it repeatedly from inside the scan loop).
        if not matched:
            code_statuses.append((status_input, False))

# Reject the run if any supplied pattern matched no known status code.
error_msg = ""
for status, is_valid in code_statuses:
    if not is_valid:
        error_msg += "Invalid status code '" + status + "' supplied\n"

if error_msg != "":
    raise Exception("\n" + error_msg)
-
# Country filter switches; skip_line_2 starts True when a filter is active.
if args.country is None:
    country_filter = False
    skip_line_2 = False
else:
    country_filter = True
    countries_filter_list = args.country
    skip_line_2 = True

if args.out_timeformat is not None:
    out_timeformat = args.out_timeformat

if args.out_field is not None:
    out_fields_list = args.out_field

# Fix: --geodir was parsed but its value was never applied.
if args.geodb is not None:
    geodb = args.geodb

# Day bounds are parsed with the day-only format; None disables the bound.
if args.day_lower is not None:
    day_lower = datetime.strptime(args.day_lower, dayformat)
else:
    day_lower = None
if args.day_upper is not None:
    day_upper = datetime.strptime(args.day_upper, dayformat)
else:
    day_upper = None

if args.log_syntax is None:
    log_syntax = in_log_syntax
else:
    log_syntax = args.log_syntax

# Fix: guarantee a trailing separator — the rest of the script builds paths
# by plain "log_dir + file" concatenation.
log_dir = os.path.join(args.log_dir, '')
files = args.log_file
no_progress = args.no_progress
files_tmp = []
parser = LogParser(log_syntax)
-
# Resolve each --files pattern against the log directory; only regular
# files whose names match the pattern are kept.
for pattern in files:
    for name in os.listdir(log_dir):
        if os.path.isfile(log_dir + name) and re.search(pattern, name):
            files_tmp.append(name)

# Process the matched files in a stable, sorted order.
files_tmp.sort()
files = files_tmp
-
def fileCheck(file, flag, env=None):
    """Return True if *file* exists and grants the requested access mode.

    file: a path, or a bare name searched in the directories of *env*.
    flag: an os access mode; accepts either the int (os.R_OK) or the
          legacy string form "os.R_OK" used by existing callers.
    env:  name of an environment variable holding an os.pathsep-separated
          path list (e.g. "PATH"); None checks *file* directly.
    """
    # Accept the historical string form without eval() — eval on an
    # attacker-influenced string is arbitrary code execution.
    if isinstance(flag, str):
        flag = getattr(os, flag.rsplit('.', 1)[-1])

    if env is None:
        filepath = file
    else:
        # Search each listed directory; if nothing matches, the last
        # candidate is checked (and fails os.access), as before.
        # .get avoids a KeyError when the variable is unset.
        for path in os.environ.get(env, '').split(os.pathsep):
            filepath = os.path.join(path, file)
            if os.path.isfile(filepath):
                break

    return os.access(filepath, flag)
-
# TODO Really exclude, when no additional args are passed to either of both
# Normal and reverse sorting are mutually exclusive.
if args.sortby_field is not None and args.sortby_field_reverse is not None:
    raise Exception("Use either normal or reverse sorting.")

sortby_field = None
reverse_order = False  # fix: was left undefined when no sort option was given
if args.sortby_field is not None:
    sortby_field = args.sortby_field
elif args.sortby_field_reverse is not None:
    sortby_field = args.sortby_field_reverse
    reverse_order = True
-
# Per-run counters and carry-over state for the parse loop below.
i = 0                  # number of log entries processed so far
country_seen = False   # previous entry came from the same host (GeoIP reuse)
prev_host = host_country = host_city = ""
log_entries = []       # rows that passed all active filters
-
# Main loop: parse every line of every matched log file, apply the day,
# status and country filters, and collect one output row per accepted entry.
for file in files:
    if not no_progress:
        print("Processing file: " + file)

    # log_dir is concatenated directly with the file name (no separator added).
    with open(log_dir + file, 'r') as f:

        for line in f:

            if not no_progress:
                print("Processing log entry: " + str(i), end = "\r")

            # Carry over the previous entry's host/time, but only when the
            # previous line was accepted (both skip flags clear).
            if i != 0 and not (skip_line_1 or skip_line_2):
                prev_host = entry_remote_host
                prev_host_time = entry_time

            entry = parser.parse(line)
            # Drop tz info so comparisons against naive datetimes work.
            entry_time = entry.request_time.replace(tzinfo=None)

            # TODO Handle situations where date_upper & date_lower are equal
            if day_upper is not None and day_lower is not None:
                if day_lower > day_upper:
                    raise Exception("Earlier day can't be later than later day")

            if day_upper is not None:
                if day_upper > datetime.now():
                    raise Exception("Day can't be in the future")

            if day_lower is not None:
                if day_lower > datetime.now():
                    raise Exception("Day can't be in the future")

            # Skip entries outside the requested day range (bounds exclusive).
            if day_lower is not None:
                if entry_time <= day_lower: continue

            if day_upper is not None:
                if entry_time >= day_upper: continue

            entry_remote_host = entry.remote_host
            entry_http_status = entry.final_status
            entry_user_agent = entry.headers_in["User-Agent"]

            # In case where request has newline or other similar chars. Tell Python interpreter to escape them
            entry_http_request = str(entry.request_line).encode('unicode_escape').decode()

            # Status filter: accept the entry if any expanded code matches;
            # skip_line_1 stays True when none do.
            if status_filter:
                for status in code_statuses:
                    num, num_ok = status
                    status = int(num)
                    if status != entry_http_status:
                        skip_line_1 = True
                    else:
                        skip_line_1 = False
                        break

            # GeoIP lookup only when the external tool and DB are usable.
            if not args.no_geo and fileCheck(geotool, "os.X_OK", "PATH") and fileCheck(geodb, "os.R_OK"):
                # Reuse the previous lookup result for a repeated host.
                if prev_host == entry.remote_host:
                    country_seen = True
                else:
                    country_seen = False

                if not country_seen:
                    host_country_main = subprocess.check_output([geotool,'-d',geodb,entry_remote_host]).rstrip().decode()
                    host_country_main = host_country_main.split('\n')
                    # First line: "...: CC, Country name" — keep the last
                    # comma-separated part as the country name.
                    host_country = re.sub(r"^.*, (.*)", r'\1', host_country_main[0])

                    if re.search("Address not found", host_country):
                        host_country = "Unknown"

                    else:
                        # Second line comes from the city database; field 4
                        # is assumed to be the city and fields 6/7 the
                        # coordinates — TODO confirm against the local
                        # geoiplookup version's output format.
                        if len(host_country_main) > 1:
                            host_city = host_country_main[1].split(', ')[4]
                            if re.search("N/A", host_city):
                                host_city = "Unknown: " + host_country_main[1].split(', ')[6] + ', ' + host_country_main[1].split(', ')[7]

                # Country filter: "!Name" excludes a country (shell-escaped
                # input arrives as "\!Name", hence the index-1 check); a bare
                # name keeps only that country.
                if country_filter:
                    for country in countries_filter_list:
                        if country[1] == "!":
                            country = country[2:]
                            if country.lower() == host_country.lower():
                                skip_line_2 = True
                                break
                            else:
                                skip_line_2 = False

                        elif country.lower() != host_country.lower():
                            skip_line_2 = True
                        else:
                            skip_line_2 = False
                            break

            else:
                # GeoIP unavailable or disabled: never skip on country.
                skip_line_2 = False

            if skip_line_1 or skip_line_2:
                i += 1
                continue

            # Seconds since the previous request from the same host;
            # "NEW_CONN" marks a host change, 0.0 the very first entry.
            time_diff = str("NEW_CONN")
            if prev_host == entry_remote_host:
                time_diff = ( entry_time - prev_host_time ).total_seconds()
                if time_diff > 0:
                    time_diff = "+" + str(time_diff)
            if i == 0:
                time_diff = float(0.0)

            # TODO: Optimize stri generation logic, avoid generating multiple times since it's really not necessary
            # (field name, field value, str.format width spec) per column.
            out_fields = [
                ('log_file_name', file, '{:s}' ),
                ('http_status', entry_http_status, '{:3s}' ),
                ('remote_host', entry_remote_host, '{:15s}'),
                ('country', host_country, '{:20s}'),
                ('city', host_city, '{:15s}'),
                ('time', entry_time, '{:8s}' ),
                ('time_diff', time_diff, '{:8s}' ),
                ('user_agent', entry_user_agent, '{:s}' ),
                ('http_request', entry_http_request, '{:s}' )
            ]

            # Build the row in the order requested via --outfields.
            # NOTE(review): 'stri' and 'entry' are re-bound here; 'stri' is
            # read again after this loop when the rows are printed.
            stri = ""
            printargs = []
            t = 0
            while t <= len(out_fields_list) - 1:

                for out_field in out_fields:
                    entry, data, striformat = out_field

                    # Geo columns are meaningless with --nogeo.
                    if args.no_geo and (entry == "country" or entry == "city"):
                        continue

                    if out_fields_list[t] == entry:
                        stri += "\t" + striformat
                        printargs.append(data)
                        break
                t += 1

            log_entries.append(printargs)

            i += 1
-
# Sort the collected rows by the requested output field, if that field is
# among the selected columns; otherwise print in collection order.
if sortby_field is not None:
    sort_field_found = False
    for sort_field_index, field in enumerate(out_fields_list):
        if field == sortby_field:
            sort_field_found = True
            break

    if sort_field_found:
        log_entries.sort(key = lambda row: row[sort_field_index], reverse = reverse_order)

if not no_progress:
    print("\n")

# Render every row through the tab-joined format string built in the
# parse loop, stringifying each value first.
for entry in log_entries:
    entry_tmp = [str(value) for value in entry]
    print(stri.format(*entry_tmp).lstrip())
-
# Optional summary: files scanned, lines parsed, lines that passed filters.
if args.show_count:
    print(
        f"\nProcessed files: {', '.join(files)}\n"
        f"Processed log entries: {i:d}\n"
        f"Matched log entries: {len(log_entries):d}\n"
    )
|