Simple Apache/HTTPD log parser for administrative analysis
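Example invocation (the script file name here is hypothetical; the flags are the ones defined by the script's argument parser below):

    python httpd_logparser.py -f /var/log/httpd/access_log,/var/log/httpd/access_log.1 -c 200 404 -gl --show-stats -ph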
#!/usr/bin/env python
# Simple Apache HTTPD log file parser
# Copyright (C) 2022 Pekka Helenius <pekka [dot] helenius [at] fjordtek [dot] com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
################################################################

# TODO: prev_host: instead of comparing to the previous entry, check whether the IP has been seen within XXX seconds
# TODO: store IP values in a temporary list for XXX seconds, and check the list values
# TODO: implement a warning check for geoiplookup tool database files, e.g. "Warning: some geo database files are very old. Please consider updating the geo database information."

import argparse
import os
import re
import subprocess
from datetime import datetime

from apachelogs import LogParser, InvalidEntryError


class text_processing(object):

    """
    Init
    """
    def __init__(self, verbose):
        self.show_verbose = verbose

    """
    Verbose output format (we do not use the logger library)
    """
    def print_verbose(self, prefix='output', *args):
        if self.show_verbose:
            print('VERBOSE [{:s}]: {:s}'.format(prefix, ', '.join([str(i) for i in args])))
class program(object):

    """
    Init
    """
    def __init__(self):
        self.args = self.get_args()
        # Private IP address ranges (127.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16):
        # excluded from the geo lookup process, and parsed with the %I and %O
        # flags stripped from the Apache log format (see process_files)
        self.private_class_ip_networks = [r'^127\.', r'^172\.(1[6-9]|2[0-9]|3[0-1])\.', r'^192\.168\.']
        self.txt = text_processing(verbose = self.args.verbose)

    """
    Define & get output fields
    """
    def get_out_fields(self):
        out_fields = {
            'log_file_name': {'data': None, 'format': '{:s}',   'included': False, 'human_name': 'Log file name', 'sort_index': 0},
            'http_status':   {'data': None, 'format': '{:3s}',  'included': True,  'human_name': 'Status',        'sort_index': 1},
            'remote_host':   {'data': None, 'format': '{:15s}', 'included': True,  'human_name': 'Remote IP',     'sort_index': 2},
            'country':       {'data': None, 'format': '{:20s}', 'included': False, 'human_name': 'Country',       'sort_index': 3},
            'city':          {'data': None, 'format': '{:15s}', 'included': False, 'human_name': 'City',          'sort_index': 4},
            'time':          {'data': None, 'format': '{:20s}', 'included': True,  'human_name': 'Date/Time',     'sort_index': 5},
            'time_diff':     {'data': None, 'format': '{:8s}',  'included': True,  'human_name': 'Time diff',     'sort_index': 6},
            'user_agent':    {'data': None, 'format': '{:s}',   'included': True,  'human_name': 'User agent',    'sort_index': 7},
            'http_request':  {'data': None, 'format': '{:s}',   'included': True,  'human_name': 'Request',       'sort_index': 8}
        }
        return out_fields
  66. """
  67. Get default Apache HTTPD configuration file location
  68. """
  69. def get_apache_conf_path(self):
  70. path = None
  71. os_data_file = '/etc/os-release'
  72. conf_path = [
  73. { 'os_check_file': os_data_file, 'os_like': 'Arch Linux', 'path': '/etc/httpd/conf/httpd.conf'},
  74. { 'os_check_file': os_data_file, 'os_like': 'Debian', 'path': '/etc/apache2/apache2.conf'},
  75. { 'os_check_file': os_data_file, 'os_like': 'Ubuntu', 'path': '/etc/apache2/apache2.conf'},
  76. { 'os_check_file': os_data_file, 'os_like': 'Linux Mint', 'path': '/etc/apache2/apache2.conf'},
  77. { 'os_check_file': os_data_file, 'os_like': 'openSUSE', 'path': '/etc/apache2/httpd.conf'},
  78. { 'os_check_file': os_data_file, 'os_like': 'Gentoo', 'path': '/etc/apache2/httpd.conf'},
  79. { 'os_check_file': os_data_file, 'os_like': 'Red Hat', 'path': '/etc/httpd/conf/httpd.conf'},
  80. { 'os_check_file': os_data_file, 'os_like': 'Fedora', 'path': '/etc/httpd/conf/httpd.conf'}
  81. ]
  82. if self.check_file(os_data_file, "os.R_OK"):
  83. with open(os_data_file, 'r') as f:
  84. for line in f:
  85. if re.match('^[ ]?NAME=\"', line):
  86. for a in conf_path:
  87. if re.match('.*' + a['os_like'] + '.*', line):
  88. path = a['path']
  89. return path
  90. return path
  91. """
  92. Argument parser
  93. """
  94. def get_args(self):
  95. all_fields = self.get_out_fields()
  96. incl_fields = [i for i in all_fields.keys() if all_fields[i]['included']]
  97. out_time_format = "%d-%m-%Y %H:%M:%S"
  98. argparser = argparse.ArgumentParser(
  99. description = 'Apache HTTPD server log parser',
  100. formatter_class = argparse.ArgumentDefaultsHelpFormatter
  101. )
  102. argparser.add_argument(
  103. '-fr', '--files-regex',
  104. help = 'Apache log files matching input regular expression.',
  105. nargs = '?',
  106. dest = 'files_regex',
  107. required = False
  108. )
  109. argparser.add_argument(
  110. '-f', '--files-list',
  111. help = 'Apache log files.\nRegular expressions supported.',
  112. nargs = '?',
  113. type = lambda x: [i for i in x.split(',')],
  114. dest = 'files_list',
  115. required = False
  116. )
  117. argparser.add_argument(
  118. '-c', '--status-codes',
  119. help = 'Print only these numerical status codes.\nRegular expressions supported.',
  120. nargs = '+',
  121. dest = 'codes',
  122. required = False
  123. )
        argparser.add_argument(
            '-cf', '--countries',
            help = 'Include only these countries.\nNegative match (exclude): "\\!Country"',
            nargs = '?',
            type = lambda x: [i for i in x.split(',')],
            dest = 'countries',
            required = False
        )
        argparser.add_argument(
            '-tf', '--time-format',
            help = 'Output time format.',
            nargs = '?',
            dest = 'time_format',
        )
        argparser.add_argument(
            '-if', '--included-fields',
            help = 'Included fields.\nAll fields: all, ' + ', '.join(all_fields),
            nargs = '?',
            dest = 'incl_fields',
            type = lambda x: [i for i in x.split(',')],
            default = ','.join(incl_fields)
        )
        argparser.add_argument(
            '-ef', '--excluded-fields',
            help = 'Excluded fields.',
            nargs = '?',
            dest = 'excl_fields',
            type = lambda x: [i for i in x.split(',')],
            default = None
        )
        argparser.add_argument(
            '-gl', '--geo-location',
            help = 'Check origin countries with the external "geoiplookup" tool.\nNOTE: Automatically includes the "country" and "city" fields.',
            action = 'store_true',
            dest = 'use_geolocation'
        )
        argparser.add_argument(
            '-ge', '--geotool-exec',
            help = '"geoiplookup" tool executable found in PATH.',
            nargs = '?',
            dest = 'geotool_exec',
            default = 'geoiplookup'
        )
        argparser.add_argument(
            '-gd', '--geo-database-dir',
            help = 'Database file directory for the "geoiplookup" tool.',
            nargs = '?',
            dest = 'geo_database_location',
            default = '/usr/share/GeoIP/'
        )
        argparser.add_argument(
            '-dl', '--day-lower',
            help = 'Do not check log entries older than this day.\nDay syntax: 31-12-2020',
            nargs = '?',
            dest = 'date_lower'
        )
        argparser.add_argument(
            '-du', '--day-upper',
            help = 'Do not check log entries newer than this day.\nDay syntax: 31-12-2020',
            nargs = '?',
            dest = 'date_upper'
        )
        argparser.add_argument(
            '-sb', '--sort-by',
            help = 'Sort by an output field.',
            nargs = '?',
            dest = 'sortby_field'
        )
        argparser.add_argument(
            '-ro', '--reverse',
            help = 'Sort in reverse order.',
            dest = 'sortby_reverse',
            action = 'store_true'
        )
        argparser.add_argument(
            '-st', '--show-stats',
            help = 'Show short statistics at the end.',
            action = 'store_true',
            dest = 'show_stats'
        )
        argparser.add_argument(
            '-p', '--show-progress',
            help = 'Show progress information.',
            dest = 'show_progress',
            action = 'store_true'
        )
        argparser.add_argument(
            '--httpd-conf-file',
            help = 'Apache HTTPD configuration file with a LogFormat directive.',
            dest = 'httpd_conf_file',
            default = self.get_apache_conf_path(),
            nargs = '?',
            type = str
        )
        argparser.add_argument(
            '--httpd-log-nickname',
            help = 'LogFormat directive nickname.',
            # A nickname string is expected downstream, so this must not be a
            # store_true flag (which would override the default with True)
            dest = 'httpd_log_nickname',
            nargs = '?',
            type = str,
            default = 'combinedio'
        )
        argparser.add_argument(
            '-lf', '--log-format',
            help = 'Log format, manually defined.',
            dest = 'log_format',
            required = False
        )
        argparser.add_argument(
            '-ph', '--print-header',
            help = 'Print column headers.',
            dest = 'column_headers',
            required = False,
            action = 'store_true'
        )
        argparser.add_argument(
            '--output-format',
            help = 'Output format for results.',
            dest = 'output_format',
            required = False,
            default = 'table',
            choices = ['table', 'csv']
        )
        argparser.add_argument(
            '--head',
            help = 'Read the first N lines from all log entries.',
            dest = 'read_first_lines_num',
            required = False,
            nargs = '?',
            type = int
        )
        argparser.add_argument(
            '--tail',
            help = 'Read the last N lines from all log entries.',
            dest = 'read_last_lines_num',
            required = False,
            nargs = '?',
            type = int
        )
        argparser.add_argument(
            '--sort-logs-by',
            help = 'Sorting order for input log files.',
            dest = 'sort_logs_by_info',
            required = False,
            default = 'name',
            choices = ['date', 'size', 'name']
        )
        argparser.add_argument(
            '--verbose',
            help = 'Verbose output.',
            dest = 'verbose',
            required = False,
            action = 'store_true'
        )
        args = argparser.parse_args()
        return args
  279. """
  280. Populate recognized HTTP status codes
  281. """
  282. def populate_status_codes(self):
  283. http_valid_codes = [
  284. '100-103',
  285. '200-208',
  286. '218'
  287. '226',
  288. '300-308',
  289. '400-431',
  290. '451',
  291. '500-511'
  292. ]
  293. codes = []
  294. for code in http_valid_codes:
  295. if len(code.split('-')) == 2:
  296. code_start = int(code.split('-')[0])
  297. code_end = int(code.split('-')[1])
  298. for i in range(code_start,code_end):
  299. codes.append(str(i))
  300. else:
  301. codes.append(code)
  302. return codes
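    # For illustration: with the inclusive range above, '200-208' expands to the
    # strings '200', '201', ..., '208', while single values such as '451' are
    # appended verbatim. All codes are kept as strings so that user-supplied
    # regular expressions (e.g. '40.') can be matched against them below.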
  303. """
  304. Get valid HTTP status codes from user input
  305. """
  306. def get_input_status_codes(self, valid_codes, user_codes):
  307. codes = []
  308. for user_code in user_codes:
  309. user_code = str(user_code)
  310. validated = False
  311. code_appended = False
  312. for valid_code in valid_codes:
  313. if re.search(user_code, valid_code):
  314. validated = True
  315. code_appended = True
  316. codes.append((valid_code, validated))
  317. else:
  318. validated = False
  319. if not code_appended:
  320. codes.append((user_code, validated))
  321. self.txt.print_verbose('Available status codes', codes)
  322. return codes
  323. """
  324. Get log file list
  325. """
  326. def get_files(self, files_regex = None, files_list = None):
  327. files = []
  328. if files_regex is None and files_list is None:
  329. raise Exception("Either single file or regex file selection method is required.")
  330. if files_regex and files_list:
  331. raise Exception("Single file and regex file selection methods are mutually exclusive.")
  332. if files_regex:
  333. log_dir = '/'.join(files_regex.split('/')[:-1])
  334. file_part = files_regex.split('/')[-1]
  335. for lfile in os.listdir(log_dir):
  336. if os.path.isfile(log_dir + '/' + lfile):
  337. if re.match(file_part, lfile):
  338. files.append(log_dir + '/' + lfile)
  339. if files_list:
  340. for lfile in files_list:
  341. if os.path.isfile(lfile):
  342. files.append(lfile)
  343. if len(files) == 0:
  344. raise Exception("No matching files found.")
  345. files.sort()
  346. self.txt.print_verbose('Input files', files)
  347. return files
  348. """
  349. Common file checker
  350. """
  351. def check_file(self, sfile, flag, env = None):
  352. file_path = sfile
  353. if env is not None:
  354. for path in os.environ[env].split(os.pathsep):
  355. file_path = os.path.join(path, sfile)
  356. if os.path.isfile(file_path):
  357. break
  358. if os.access(file_path, eval(flag)):
  359. try:
  360. self.txt.print_verbose('File check', file_path, 'flags: ' + flag)
  361. except AttributeError:
  362. pass
  363. return True
  364. return False
  365. """
  366. Get Apache HTTPD LogFormat directive syntax
  367. """
  368. def get_httpd_logformat_directive(self, cfile, tag = None):
  369. if not self.check_file(cfile, "os.R_OK"):
  370. raise Exception("Couldn't open Apache HTTPD configuration file '{:s}'.".format(cfile))
  371. log_format = None
  372. self.txt.print_verbose('Apache configuration file', cfile)
  373. with open(cfile, 'r') as f:
  374. for line in f:
  375. if re.search('^[ ]+LogFormat ".*' + tag, line):
  376. r = re.search('^[ ]+LogFormat "(.*)(!?("))', line)
  377. log_format = r.groups()[0].replace('\\', '')
  378. break
  379. f.close()
  380. self.txt.print_verbose('Log format', log_format)
  381. return log_format
  382. """
  383. Geotool processing
  384. """
  385. def geotool_get_data(self, geotool_ok, geotool_exec, database_file, remote_host):
  386. host_country = None
  387. host_city = None
  388. if re.match('|'.join(self.private_class_ip_networks), remote_host):
  389. host_country = "Local"
  390. host_city = "Local"
  391. return {
  392. 'host_country': host_country,
  393. 'host_city': host_city
  394. }
  395. if geotool_ok:
  396. host_country_main = subprocess.check_output([geotool_exec,'-d', database_file, remote_host]).rstrip().decode()
  397. host_country_main = host_country_main.split('\n')
  398. try:
  399. host_country = host_country_main[0].split(', ')[1]
  400. except:
  401. if re.search("Address not found", host_country_main[0]):
  402. host_country = "Unknown"
  403. if len(host_country_main) > 1:
  404. try:
  405. host_city = host_country_main[1].split(', ')[4]
  406. if re.search("N/A", host_city):
  407. host_city = "Unknown: " + host_country_main[1].split(', ')[6] + ', ' + host_country_main[1].split(', ')[7]
  408. except:
  409. pass
  410. return {
  411. 'host_country': host_country,
  412. 'host_city': host_city
  413. }
  414. return None
  415. """
  416. Status code filter
  417. """
  418. def filter_status_code(self, status_codes, final_status):
  419. skip_line = True
  420. for status in status_codes:
  421. # Status consists of numerical status value (num) and validity boolean value (num_ok)
  422. if len(status) != 2:
  423. continue
  424. num, num_ok = status
  425. if num_ok:
  426. status = int(num)
  427. if status == final_status:
  428. skip_line = False
  429. break
  430. return skip_line
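    # Example: with user input '404', the filter receives [('404', True)] and
    # returns skip_line = False only for entries whose final status is 404; all
    # other entries are skipped. Unvalidated codes (num_ok == False) never match.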
  431. """
  432. Country name filter
  433. """
  434. def filter_country(self, countries, host_country):
  435. skip_line = True
  436. for country in countries:
  437. if country[1] == "!":
  438. country = country[2:]
  439. if country.lower() == host_country.lower():
  440. skip_line = True
  441. break
  442. else:
  443. skip_line = False
  444. elif country.lower() == host_country.lower():
  445. skip_line = False
  446. break
  447. return skip_line
  448. """
  449. Get lines to be processed from input files and min/max input
  450. min and max work much like Unix tools 'head' and 'tail'
  451. Only a single value (min or max) is allowed
  452. """
  453. def get_file_lines_head_tail(self, sfiles, line_range_min = None, line_range_max = None, files_order = None):
  454. files_and_lines = {'files': [], 'lines_total': 0, 'range_min': 0, 'range_max': 0}
  455. files_tmp = []
  456. lines_count = 0
  457. line_start = 0
  458. line_end = 0
  459. if line_range_min and line_range_max:
  460. raise Exception("Either first or last line limit can be used, not both.")
  461. if files_order is None:
  462. raise Exception("Sorting order for input files missing.")
  463. if line_range_min is not None:
  464. if line_range_min < 0:
  465. line_range_min = None
  466. if line_range_max is not None:
  467. if line_range_max < 0:
  468. line_range_max = None
  469. for sfile in sfiles:
  470. if not self.check_file(sfile, "os.R_OK"):
  471. raise Exception("Couldn't read input file '{}'.".format(sfile))
  472. with open(sfile, 'r') as f:
  473. line_count = len(list(f))
  474. f.close()
  475. files_tmp.append({
  476. 'file': str(sfile),
  477. 'modified_date': os.path.getmtime(sfile),
  478. 'size': os.path.getsize(sfile),
  479. 'line_count': line_count
  480. })
  481. if files_order == 'date':
  482. files_tmp.sort(key = lambda d: d['modified_date'])
  483. elif files_order == 'size':
  484. files_tmp.sort(key = lambda d: d['size'])
  485. elif files_order == 'name':
  486. files_tmp.sort(key = lambda d: d['file'])
  487. i = 0
  488. for sfile in files_tmp:
  489. line_end = (line_start + sfile['line_count']) - 1
  490. files_and_lines['files'].append({
  491. 'file': sfile['file'],
  492. 'line_start_global': line_start,
  493. 'line_end_global': line_end,
  494. 'line_start_local': 0,
  495. 'line_end_local': sfile['line_count'] - 1,
  496. })
  497. lines_count += line_count
  498. line_start = files_and_lines['files'][i]['line_end_global'] + 1
  499. i += 1
  500. range_line_start = files_and_lines['files'][0]['line_start_global']
  501. full_range = files_and_lines['files'][-1]['line_end_global']
  502. files_and_lines['range_min'] = range_line_start
  503. files_and_lines['range_max'] = full_range
  504. files_and_lines['lines_total'] = full_range - range_line_start
  505. i = 0
  506. # Read last N lines
  507. if line_range_max is not None:
  508. range_start = full_range - line_range_max
  509. if range_start <= 0:
  510. range_start = 0
  511. for l in files_and_lines['files']:
  512. if range_start >= l['line_start_global'] and range_start <= l['line_end_global']:
  513. l['line_start_global'] = range_start
  514. l['line_start_local'] = l['line_end_local'] - (l['line_end_global'] - range_start)
  515. del files_and_lines['files'][:i]
  516. i += 1
  517. # Read first N lines
  518. if line_range_min is not None:
  519. range_end = line_range_min
  520. if range_end >= full_range:
  521. range_end = full_range
  522. for l in files_and_lines['files']:
  523. if range_end >= l['line_start_global'] and range_end <= l['line_end_global']:
  524. l['line_end_local'] = l['line_end_local'] - l['line_start_local'] - (l['line_end_global'] - range_end)
  525. l['line_end_global'] = range_end
  526. del files_and_lines['files'][i + 1:]
  527. i += 1
  528. return files_and_lines
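    # Example of the tail logic above (a sketch, not tied to any particular log):
    # with two 100-line files, global lines run 0-99 and 100-199, so full_range
    # is 199. A --tail value of 50 gives range_start = 149; the second file is
    # trimmed to local lines 49-99 and the first file is dropped from the list.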
  529. """
  530. Get lines to be processed from input files and range input
  531. Range: <min> - <max>
  532. """
  533. def get_file_lines_range(self, sfiles, line_range_min=None, line_range_max=None):
  534. files_and_lines = {'files': [], 'lines_total': 0, 'range_min': 0, 'range_max': 0}
  535. lines_count = 0
  536. line_start = 0
  537. line_end = 0
  538. range_line_start = 0
  539. range_line_end = 0
  540. range_line_start_found = False
  541. if line_range_min is not None:
  542. if line_range_min < 0:
  543. line_range_min = None
  544. if line_range_max is not None:
  545. if line_range_max < 0:
  546. line_range_max = None
  547. for sfile in sfiles:
  548. append = False
  549. if not self.check_file(sfile, "os.R_OK"):
  550. raise Exception("Couldn't read input file '{}'.".format(sfile))
  551. with open(sfile, 'r') as f:
  552. line_count = len(list(f))
  553. f.close()
  554. line_end = line_start + line_count
  555. if line_range_min is not None:
  556. if line_range_min >= line_start and line_range_min <= line_end:
  557. append = True
  558. line_start = line_range_min
  559. if line_range_min is None and line_end < line_range_max:
  560. append = True
  561. if line_range_max is not None:
  562. if line_range_max >= line_start and line_range_max <= line_end:
  563. append = True
  564. line_end = line_range_max
  565. if line_range_min < line_end and line_range_max > line_end:
  566. append = True
  567. if line_range_max is None and line_start > line_range_min:
  568. append = True
  569. if append:
  570. files_and_lines['files'].append({
  571. 'file': str(sfile),
  572. 'line_start_global': line_start,
  573. 'line_end_global': line_end,
  574. 'modified_date': os.path.getmtime(sfile),
  575. 'size': os.path.getsize(sfile)
  576. })
  577. # Use only the first matching line_start value
  578. if not range_line_start_found:
  579. range_line_start_found = True
  580. range_line_start = line_start
  581. # Use the last matching line_end value
  582. range_line_end = line_end
  583. lines_count += line_count
  584. line_start = lines_count + 1
  585. files_and_lines['lines_total'] = range_line_end - range_line_start
  586. files_and_lines['range_min'] = range_line_start
  587. files_and_lines['range_max'] = range_line_end
  588. return files_and_lines
  589. """
  590. Date checker
  591. """
  592. def date_checker(self, date_lower, date_upper, entry_time):
  593. # TODO Handle situations where date_upper & date_lower are equal
  594. if date_upper is not None and date_lower is not None:
  595. if date_lower > date_upper:
  596. raise Exception("Earlier day can't be later than later day")
  597. if date_upper is not None:
  598. if date_upper > datetime.now():
  599. raise Exception("Day can't be in the future")
  600. if date_lower is not None:
  601. if date_lower > datetime.now():
  602. raise Exception("Day can't be in the future")
  603. if date_lower is not None:
  604. if entry_time <= date_lower: return False
  605. if date_upper is not None:
  606. if entry_time >= date_upper: return False
  607. return True
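    # Note: both bounds are exclusive. Since the -dl/-du dates are parsed at day
    # granularity (midnight), an entry stamped exactly at a boundary midnight is
    # rejected as well.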
  608. """
  609. Get output field definitions (sortby)
  610. """
  611. def get_out_field(self, fields, field_input):
  612. i = 0
  613. for field in fields:
  614. if field == field_input:
  615. return [True, i]
  616. i += 1
  617. return [False, i]
  618. """
  619. Get included fields
  620. """
  621. def get_included_fields(self, fields, included_fields, excluded_fields=None):
  622. if included_fields:
  623. # TODO: simplify logic
  624. n = 0
  625. included_fields = [[i.replace(' ',''), 0] for i in included_fields]
  626. for a in included_fields:
  627. a[1] += n
  628. n += 1
  629. if excluded_fields:
  630. excluded_fields = [i.replace(' ','') for i in excluded_fields]
  631. all_defined_fields = []
  632. fields_out = {}
  633. if 'all' in included_fields or included_fields is None:
  634. included_fields = [[i, int(i['sort_index'])] for i in fields.keys()]
  635. if excluded_fields is not None:
  636. if 'all' in excluded_fields:
  637. raise Exception("No output fields defined.")
  638. # TODO: simplify logic
  639. n = 0
  640. included_fields = [[i, 0] for i in included_fields if i not in excluded_fields]
  641. for a in included_fields:
  642. a[1] += n
  643. n += 1
  644. all_defined_fields = [i[0] for i in included_fields] + excluded_fields
  645. else:
  646. all_defined_fields = included_fields
  647. for i in all_defined_fields:
  648. if i[0] not in fields.keys():
  649. raise Exception("Unknown field value: {}. Accepted values: {}".format(i, ','.join(fields.keys())))
  650. for a in included_fields:
  651. for key, value in fields.items():
  652. if key == a[0]:
  653. value['sort_index'] = a[1]
  654. value['included'] = True
  655. fields_out[key] = value
  656. if len(fields_out.keys()) == 0:
  657. raise Exception("No output fields defined.")
  658. return fields_out
  659. """
  660. Process input files
  661. """
  662. def process_files(self):
  663. prev_host = ""
  664. log_entries = []
  665. codes = []
  666. countries = []
  667. # Log format as defined in Apache/HTTPD configuration file (LogFormat directive) or manually by user
  668. if self.args.log_format:
  669. log_format = self.args.log_format
  670. else:
  671. log_format = self.get_httpd_logformat_directive(self.args.httpd_conf_file, self.args.httpd_log_nickname)
  672. # Remove bytes in & out fields from local traffic pattern
  673. log_format_local = log_format.replace('%I','').replace('%O','').strip()
  674. parser = LogParser(log_format)
  675. parser_local = LogParser(log_format_local)
  676. if self.args.codes:
  677. codes = self.get_input_status_codes(self.populate_status_codes(), self.args.codes)
  678. if self.args.countries:
  679. countries = self.args.countries
  680. date_lower = self.args.date_lower
  681. date_upper = self.args.date_upper
  682. day_format = "%d-%m-%Y"
  683. if date_lower is not None:
  684. date_lower = datetime.strptime(date_lower, day_format)
  685. if date_upper is not None:
  686. date_upper = datetime.strptime(date_upper, day_format)
  687. geotool_exec = self.args.geotool_exec
  688. geo_database_location = self.args.geo_database_location
  689. incl_fields = self.args.incl_fields
  690. if isinstance(self.args.incl_fields, str):
  691. incl_fields = self.args.incl_fields.split(',')
  692. use_geolocation = self.args.use_geolocation
  693. geotool_ok = False
  694. if use_geolocation:
  695. if self.check_file(geotool_exec, "os.X_OK", "PATH") and self.check_file(geo_database_location, "os.R_OK"):
  696. geotool_ok = True
  697. if use_geolocation:
  698. if 'country' not in incl_fields:
  699. incl_fields.append('country')
  700. if 'city' not in incl_fields:
  701. incl_fields.append('city')
  702. if 'country' in incl_fields or 'city' in incl_fields:
  703. use_geolocation = True
  704. fields = self.get_included_fields(
  705. self.get_out_fields(),
  706. incl_fields,
  707. self.args.excl_fields
  708. )
  709. invalid_lines = []
  710. field_names = []
  711. country_seen = False
  712. geo_data = None
  713. skip_line_by_status = False
  714. skip_line_by_country = False
  715. file_num = 0
  716. stri = ""
  717. files_input = self.get_files(self.args.files_regex, self.args.files_list)
  718. files_process_data = self.get_file_lines_head_tail(
  719. files_input,
  720. self.args.read_first_lines_num,
  721. self.args.read_last_lines_num,
  722. self.args.sort_logs_by_info
  723. )
  724. lines_total = files_process_data['lines_total']
  725. files_total = len(files_process_data['files'])
  726. self.txt.print_verbose(
  727. 'Log entry range',
  728. str(files_process_data['files'][0]['line_start_global'])
  729. + ' - ' +
  730. str(files_process_data['files'][-1]['line_end_global'])
  731. )
  732. if self.args.show_progress or self.args.verbose:
  733. print(
  734. "File count: {}\nLines in total: {}".format(
  735. str(files_total),
  736. str(lines_total)
  737. ))
  738. for lfile in files_process_data['files']:
  739. if self.args.show_progress or self.args.verbose:
  740. print("Processing file: {:s} (lines: {:d}-{:d})".format(
  741. lfile['file'],
  742. lfile['line_start_global'], lfile['line_end_global']
  743. ))
  744. if not self.check_file(lfile['file'], "os.R_OK"):
  745. raise Exception("Couldn't read input file '{}'.".format(lfile['file']))
  746. with open(lfile['file'], 'r') as f:
  747. f = list(f)
  748. range_start = files_process_data['files'][file_num]['line_start_local']
  749. range_end = files_process_data['files'][file_num]['line_end_local']
  750. lines = range(range_start, range_end)
  751. line_num = 1
  752. for line in lines:
  753. if self.args.show_progress or self.args.verbose:
  754. print("Processing log entry: {:d}/{:d} ({}%)".format(
  755. line_num,
  756. len(lines),
  757. round(100 * (line_num/len(lines)), 2)
  758. ), end = "\r")
  759. if line_num != 1 and not (skip_line_by_status or skip_line_by_country) and entry_data:
  760. prev_host = entry_data['remote_host']
  761. prev_host_time = entry_data['time']
  762. try:
  763. if re.match('|'.join(self.private_class_ip_networks), f[line]):
  764. entry = parser_local.parse(f[line])
  765. else:
  766. entry = parser.parse(f[line])
  767. except InvalidEntryError:
  768. invalid_lines.append((lfile['file'], line_num))
  769. line_num += 1
  770. continue
  771. entry_data = {
  772. 'time': entry.request_time.replace(tzinfo = None),
  773. 'user_agent': entry.headers_in["User-Agent"],
  774. 'http_request': str(entry.request_line).encode('unicode_escape').decode(),
  775. 'remote_host': entry.remote_host,
  776. 'status': entry.final_status
  777. }
  778. if not self.date_checker(date_lower, date_upper, entry_data['time']):
  779. line_num += 1
  780. continue
  781. if len(codes) > 0:
  782. skip_line_by_status = self.filter_status_code(codes, entry_data['status'])
  783. if use_geolocation:
  784. if prev_host == entry_data['remote_host']:
  785. country_seen = True
  786. else:
  787. country_seen = False
  788. if not country_seen:
  789. geo_data = self.geotool_get_data(geotool_ok, geotool_exec, geo_database_location, entry_data['remote_host'])
  790. if len(countries) > 0 and geo_data is not None:
  791. skip_line_by_country = self.filter_country(countries, geo_data['host_country'])
  792. else:
  793. skip_line_by_country = False
  794. if skip_line_by_status or skip_line_by_country:
  795. line_num += 1
  796. continue
  797. time_diff = str('NEW_CONN')
  798. if prev_host == entry_data['remote_host']:
  799. time_diff = (entry_data['time'] - prev_host_time).total_seconds()
  800. if isinstance(time_diff, float):
  801. time_diff = int(time_diff)
  802. if time_diff > 0:
  803. time_diff = "+" + str(time_diff)
  804. if line_num == 1 and file_num == 0:
  805. time_diff = int(0)
  806. if 'log_file_name' in fields:
  807. fields['log_file_name']['data'] = lfile
  808. if 'http_status' in fields:
  809. fields['http_status']['data'] = entry_data['status']
  810. if 'remote_host' in fields:
  811. fields['remote_host']['data'] = entry_data['remote_host']
  812. if geo_data is not None:
  813. if 'country' in fields:
  814. fields['country']['data'] = geo_data['host_country']
  815. if 'city' in fields:
  816. fields['city']['data'] = geo_data['host_city']
  817. if 'time' in fields:
  818. fields['time']['data'] = entry_data['time']
  819. if 'time_diff' in fields:
  820. fields['time_diff']['data'] = time_diff
  821. if 'user_agent' in fields:
  822. fields['user_agent']['data'] = entry_data['user_agent']
  823. if 'http_request' in fields:
  824. fields['http_request']['data'] = entry_data['http_request']
  825. stri = ""
  826. printargs = []
  827. for key, value in fields.items():
  828. if not use_geolocation and (key == 'country' or key == 'city'):
  829. continue
  830. if value['included']:
  831. stri += "\t" + value['format']
  832. printargs.append(value['data'])
  833. if not any(key in i for i in field_names):
  834. field_names.append((key, value['human_name']))
  835. log_entries.append(printargs)
  836. line_num += 1
  837. file_num += 1
  838. return [log_entries, files_process_data['files'], lines_total, stri, field_names, invalid_lines]
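    # Layout of the returned list, for reference:
    #   [0] matched log entry rows        [3] per-row format string
    #   [1] per-file processing data      [4] (field key, human name) pairs
    #   [2] total line count in range     [5] unparsable (file, line) tuples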
  839. """
  840. Execute
  841. """
  842. def execute(self):
  843. print_headers = self.args.column_headers
  844. show_progress = self.args.show_progress
  845. show_stats = self.args.show_stats
  846. output_format = self.args.output_format
  847. sortby_field = self.args.sortby_field
  848. reverse_order = self.args.sortby_reverse
  849. if self.args.incl_fields:
  850. if 'all' not in self.args.incl_fields:
  851. if sortby_field and sortby_field not in self.args.incl_fields:
  852. raise Exception("Sort-by field must be included in output fields.")
  853. results = self.process_files()
  854. result_entries = results[0]
  855. result_files = results[1]
  856. result_lines = results[2]
  857. stri = results[3]
  858. out_fields = [i[0] for i in results[4]]
  859. out_fields_human_names = [i[1] for i in results[4]]
  860. invalid_lines = results[5]
  861. if sortby_field is None and reverse_order:
  862. raise Exception("You must define a field for reverse sorting.")
  863. if sortby_field is not None:
  864. out_field_validation = self.get_out_field(out_fields, sortby_field)
  865. if out_field_validation[0]:
  866. result_entries.sort(
  867. key = lambda r : r[out_field_validation[1]] or '',
  868. reverse = reverse_order
  869. )
  870. if output_format == 'table':
  871. if print_headers:
  872. print("\n")
  873. print(stri.format(*out_fields_human_names).lstrip())
  874. for entry in result_entries:
  875. c = 0
  876. entry_items = []
  877. while c < len(entry):
  878. entry_items.append(str(entry[c]))
  879. c += 1
  880. print(stri.format(*entry_items).lstrip())
  881. if output_format == 'csv':
  882. if print_headers:
  883. print(','.join(out_fields_human_names))
  884. for entry in result_entries:
  885. c = 0
  886. entry_items = []
  887. while c < len(entry):
  888. entry_items.append(str(entry[c]))
  889. c += 1
  890. print(','.join(entry_items))
  891. if show_stats:
  892. print(("\n" +
  893. "Processed files: {:s}\n" +
  894. "Processed log entries: {:d}\n" +
  895. "Matched log entries: {:d}\n"
  896. ).format(
  897. ', '.join([i['file'] for i in result_files['files']]),
  898. result_lines,
  899. len(result_entries)
  900. )
  901. )
  902. if len(invalid_lines) > 0:
  903. print("Invalid lines:")
  904. for i in invalid_lines:
  905. print("\tFile: {:s}, line: {:d}".format(i[0], i[1]))
  906. print("\n")
  907. if __name__ == "__main__":
  908. app = program()
  909. app.execute()