Simple Apache/HTTPD log parser for administrative analysis
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

413 lines
14 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. #!/bin/env python
  2. # Simple Apache log parser
  3. # Copyright (C) 2020 Pekka Helenius
  4. #
  5. # This program is free software: you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation, either version 3 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program. If not, see <https://www.gnu.org/licenses/>.
  17. ################################################################
  18. # TODO prev_host: instead of comparing to previous entry, check if such IP has been seen in XXX seconds
  19. # store IP values for temporary list for XXX seconds, and check list values
  20. import argparse
  21. import os
  22. import re
  23. import subprocess
  24. import sys
  25. from datetime import datetime
  26. from apachelogs import LogParser
  27. out_fields_list = ['log_file_name', 'http_status', 'remote_host', 'country', 'city', 'time', 'time_diff', 'user_agent', 'http_request']
  28. out_timeformat = "%d-%m-%Y %H:%M:%S"
  29. dayformat = "%d-%m-%Y"
  30. ot = '"' + re.sub(r'%', '%%', out_timeformat) + '"'
  31. geotool = "geoiplookup"
  32. geodb = "/usr/share/GeoIP/"
  33. # Log format as defined in Apache/HTTPD configuration file (LogFormat directive)
  34. in_log_syntax = "%h %u %t %T \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" \"%{cache-status}e\" %I %O"
  35. argparser = argparse.ArgumentParser()
  36. argparser.add_argument('-d', '--dir', help = 'Apache log file directory.', nargs = '?', dest = 'log_dir', required = True)
  37. argparser.add_argument('-f', '--files', help = 'Apache log files. Regular expressions supported.', nargs = '+', dest = 'log_file', required = True)
  38. argparser.add_argument('-s', '--logsyntax', help = 'Apache log files syntax, defined as "LogFormat" directive in Apache configuration.', nargs = '?', dest = 'log_syntax')
  39. argparser.add_argument('-c', '--statuscodes', help = 'Print only these status codes. Regular expressions supported.', nargs = '+', dest = 'status_code')
  40. argparser.add_argument('-cf', '--countryfilter', help = 'Include only these countries. Negative match (exclude): "\!Country"', nargs = '+', dest = 'country')
  41. argparser.add_argument('-ot', '--outtimeformat', help = 'Output time format.\nDefault: ' + ot, nargs = '?', dest = 'out_timeformat')
  42. argparser.add_argument('-of', '--outfields', help = 'Output fields.\nDefault: ' + ', '.join(out_fields_list), nargs = '+', dest = 'out_field')
  43. argparser.add_argument('-ng', '--nogeo', help = 'Skip country check with external "geoiplookup" tool.', action='store_true', dest = 'no_geo')
  44. argparser.add_argument('-gd', '--geodir', help = 'Database file directory for "geoiplookup" tool.\nDefault: ' + geodb, nargs = '?', dest = 'geodb')
  45. argparser.add_argument('-dl', '--daylower', help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020', nargs = '?', dest = 'day_lower')
  46. argparser.add_argument('-du', '--dayupper', help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020', nargs = '?', dest = 'day_upper')
  47. argparser.add_argument('-sb', '--sortby', help = 'Sort by an output field.', nargs = '?', dest = 'sortby_field')
  48. argparser.add_argument('-sbr', '--sortbyreverse', help = 'Sort by an output field, reverse order.', nargs = '?', dest = 'sortby_field_reverse')
  49. argparser.add_argument('-st', '--stats', help = 'Show short statistics at the end.', action='store_true', dest = 'show_count')
  50. argparser.add_argument('-np', '--noprogress', help = 'Do not show progress information.', action='store_true', dest = 'no_progress')
  51. args = argparser.parse_args()
  52. if args.status_code is None:
  53. status_filter = False
  54. skip_line_1 = False
  55. else:
  56. status_filter = True
  57. skip_line_1 = True
  58. status_codes = args.status_code
  59. http_valid_codes = [
  60. '100',
  61. '101',
  62. '102',
  63. '103',
  64. '200',
  65. '201',
  66. '202',
  67. '203',
  68. '204',
  69. '205',
  70. '206',
  71. '207',
  72. '208',
  73. '226',
  74. '300',
  75. '301',
  76. '302',
  77. '303',
  78. '304',
  79. '305',
  80. '306',
  81. '307',
  82. '308',
  83. '400',
  84. '401',
  85. '402',
  86. '403',
  87. '404',
  88. '405',
  89. '406',
  90. '407',
  91. '408',
  92. '409',
  93. '410',
  94. '411',
  95. '412',
  96. '413',
  97. '414',
  98. '415',
  99. '416',
  100. '417',
  101. '418',
  102. '421',
  103. '422',
  104. '423',
  105. '424',
  106. '425',
  107. '426',
  108. '428',
  109. '429',
  110. '431',
  111. '451',
  112. '500',
  113. '501',
  114. '502',
  115. '503',
  116. '504',
  117. '505',
  118. '506',
  119. '507',
  120. '508',
  121. '510',
  122. '511',
  123. '218'
  124. ]
  125. code_statuses = []
  126. for status_input in status_codes:
  127. init_status = False
  128. status_append = status_input
  129. status_appended = False
  130. for status_valid in http_valid_codes:
  131. if re.search(status_input, status_valid):
  132. status_append = status_valid
  133. init_status = True
  134. status_appended = True
  135. code_statuses.append((status_append, init_status))
  136. else:
  137. init_status = False
  138. if not status_appended:
  139. code_statuses.append((status_append, init_status))
  140. error_msg = ""
  141. for vl in code_statuses:
  142. status, init_status = vl
  143. if not init_status:
  144. error_msg += "Invalid status code '" + status + "' supplied\n"
  145. if error_msg != "":
  146. raise Exception("\n" + error_msg)
  147. if args.country is None:
  148. country_filter = False
  149. skip_line_2 = False
  150. else:
  151. country_filter = True
  152. countries_filter_list = args.country
  153. skip_line_2 = True
  154. if args.out_timeformat is not None:
  155. out_timeformat = args.out_timeformat
  156. if args.out_field is not None:
  157. out_fields_list = args.out_field
  158. if args.day_lower is not None:
  159. day_lower = datetime.strptime(args.day_lower, dayformat)
  160. else:
  161. day_lower = None
  162. if args.day_upper is not None:
  163. day_upper = datetime.strptime(args.day_upper, dayformat)
  164. else:
  165. day_upper = None
  166. if args.log_syntax is None:
  167. log_syntax = in_log_syntax
  168. else:
  169. log_syntax = args.log_syntax
  170. log_dir = args.log_dir
  171. files = args.log_file
  172. no_progress = args.no_progress
  173. files_tmp = []
  174. parser = LogParser(log_syntax)
  175. for file_regex in files:
  176. for file in os.listdir(log_dir):
  177. fullpath = log_dir + file
  178. if os.path.isfile(fullpath):
  179. if re.search(file_regex, file):
  180. files_tmp.append(file)
  181. files_tmp.sort()
  182. files = files_tmp
  183. def fileCheck(file, flag, env=None):
  184. if env is None:
  185. filepath = file
  186. else:
  187. for path in os.environ[env].split(os.pathsep):
  188. filepath = os.path.join(path, file)
  189. if os.path.isfile(filepath):
  190. break
  191. if os.access(filepath, eval(flag)):
  192. return True
  193. return False
  194. # TODO Really exclude, when no additional args are passed to either of both
  195. if args.sortby_field is not None and args.sortby_field_reverse is not None:
  196. raise Exception("Use either normal or reverse sorting.")
  197. sortby_field = None
  198. if args.sortby_field is not None:
  199. sortby_field = args.sortby_field
  200. reverse_order = False
  201. elif args.sortby_field_reverse is not None:
  202. sortby_field = args.sortby_field_reverse
  203. reverse_order = True
  204. i = 0
  205. country_seen = False
  206. prev_host = ""
  207. host_country = ""
  208. host_city = ""
  209. log_entries = []
  210. for file in files:
  211. if not no_progress:
  212. print("Processing file: " + file)
  213. with open(log_dir + file, 'r') as f:
  214. for line in f:
  215. if not no_progress:
  216. print("Processing log entry: " + str(i), end = "\r")
  217. if i != 0 and not (skip_line_1 or skip_line_2):
  218. prev_host = entry_remote_host
  219. prev_host_time = entry_time
  220. entry = parser.parse(line)
  221. entry_time = entry.request_time.replace(tzinfo=None)
  222. # TODO Handle situations where date_upper & date_lower are equal
  223. if day_upper is not None and day_lower is not None:
  224. if day_lower > day_upper:
  225. raise Exception("Earlier day can't be later than later day")
  226. if day_upper is not None:
  227. if day_upper > datetime.now():
  228. raise Exception("Day can't be in the future")
  229. if day_lower is not None:
  230. if day_lower > datetime.now():
  231. raise Exception("Day can't be in the future")
  232. if day_lower is not None:
  233. if entry_time <= day_lower: continue
  234. if day_upper is not None:
  235. if entry_time >= day_upper: continue
  236. entry_remote_host = entry.remote_host
  237. entry_http_status = entry.final_status
  238. entry_user_agent = entry.headers_in["User-Agent"]
  239. # In case where request has newline or other similar chars. Tell Python interpreter to escape them
  240. entry_http_request = str(entry.request_line).encode('unicode_escape').decode()
  241. if status_filter:
  242. for status in code_statuses:
  243. num, num_ok = status
  244. status = int(num)
  245. if status != entry_http_status:
  246. skip_line_1 = True
  247. else:
  248. skip_line_1 = False
  249. break
  250. if not args.no_geo and fileCheck(geotool, "os.X_OK", "PATH") and fileCheck(geodb, "os.R_OK"):
  251. if prev_host == entry.remote_host:
  252. country_seen = True
  253. else:
  254. country_seen = False
  255. if not country_seen:
  256. host_country_main = subprocess.check_output([geotool,'-d',geodb,entry_remote_host]).rstrip().decode()
  257. host_country_main = host_country_main.split('\n')
  258. host_country = re.sub(r"^.*, (.*)", r'\1', host_country_main[0])
  259. if re.search("Address not found", host_country):
  260. host_country = "Unknown"
  261. else:
  262. if len(host_country_main) > 1:
  263. host_city = host_country_main[1].split(', ')[4]
  264. if re.search("N/A", host_city):
  265. host_city = "Unknown: " + host_country_main[1].split(', ')[6] + ', ' + host_country_main[1].split(', ')[7]
  266. if country_filter:
  267. for country in countries_filter_list:
  268. if country[1] == "!":
  269. country = country[2:]
  270. if country.lower() == host_country.lower():
  271. skip_line_2 = True
  272. break
  273. else:
  274. skip_line_2 = False
  275. elif country.lower() != host_country.lower():
  276. skip_line_2 = True
  277. else:
  278. skip_line_2 = False
  279. break
  280. else:
  281. skip_line_2 = False
  282. if skip_line_1 or skip_line_2:
  283. i += 1
  284. continue
  285. time_diff = str("NEW_CONN")
  286. if prev_host == entry_remote_host:
  287. time_diff = ( entry_time - prev_host_time ).total_seconds()
  288. if time_diff > 0:
  289. time_diff = "+" + str(time_diff)
  290. if i == 0:
  291. time_diff = float(0.0)
  292. # TODO: Optimize stri generation logic, avoid generating multiple times since it's really not necessary
  293. out_fields = [
  294. ('log_file_name', file, '{:s}' ),
  295. ('http_status', entry_http_status, '{:3s}' ),
  296. ('remote_host', entry_remote_host, '{:15s}'),
  297. ('country', host_country, '{:20s}'),
  298. ('city', host_city, '{:15s}'),
  299. ('time', entry_time, '{:8s}' ),
  300. ('time_diff', time_diff, '{:8s}' ),
  301. ('user_agent', entry_user_agent, '{:s}' ),
  302. ('http_request', entry_http_request, '{:s}' )
  303. ]
  304. stri = ""
  305. printargs = []
  306. t = 0
  307. while t <= len(out_fields_list) - 1:
  308. for out_field in out_fields:
  309. entry, data, striformat = out_field
  310. if args.no_geo and (entry == "country" or entry == "city"):
  311. continue
  312. if out_fields_list[t] == entry:
  313. stri += "\t" + striformat
  314. printargs.append(data)
  315. break
  316. t += 1
  317. log_entries.append(printargs)
  318. i += 1
  319. if sortby_field is not None:
  320. sort_field_found = False
  321. d = 0
  322. for field in out_fields_list:
  323. if field == sortby_field:
  324. sort_field_index = d
  325. sort_field_found = True
  326. break
  327. d += 1
  328. if sort_field_found:
  329. log_entries.sort(key = lambda log_entries: log_entries[sort_field_index], reverse=reverse_order)
  330. if not no_progress:
  331. print("\n")
  332. for entry in log_entries:
  333. c = 0
  334. entry_tmp = []
  335. while c <= len(entry) - 1:
  336. entry_tmp.append(str(entry[c]))
  337. c += 1
  338. print(stri.format(*entry_tmp).lstrip())
  339. if args.show_count:
  340. print(("\n" +
  341. "Processed files: {:s}\n" +
  342. "Processed log entries: {:d}\n" +
  343. "Matched log entries: {:d}\n").format(
  344. ', '.join(files),
  345. i,
  346. len(log_entries)
  347. )
  348. )