logide-parsimine/raport/log_parsers.py

69 lines
2.6 KiB
Python
Raw Normal View History

2018-06-15 21:21:13 +00:00
# encoding: utf-8
import re
import sys
from datetime import datetime
class GenericParser(object):
    """Base class that groups the lines of a log stream into log events.

    Subclasses are expected to provide:

    * ``RE_LOG_ENTRY`` -- a regular expression matched against the start of
      every line; it must define at least the named groups ``timestamp``
      and ``message``.
    * ``TIMESTAMP_FORMAT`` -- the ``strptime`` format of the ``timestamp``
      group.
    * ``is_serious(log_entry)`` -- predicate used when ``errors_only`` is
      set; receives the ``groupdict()`` of the matched line.
    """

    def __init__(self, fh, errors_only=False):
        """Store the line source and the filtering mode.

        :param fh: any iterable yielding text lines, e.g. ``sys.stdin``,
            the result of ``gzip.open`` / ``open``, or a list of strings.
        :param errors_only: when true, only entries for which
            ``is_serious`` returns a true value are yielded.
        """
        self.fh = fh
        self.errors_only = errors_only

    def _should_emit(self, log_entry):
        """Tell whether the entry passes the optional errors-only filter."""
        return not self.errors_only or self.is_serious(log_entry)

    def _build_event(self, log_entry, message, byte_count, line_count,
                     event_count):
        """Assemble the tuple handed to the consumer for one log entry."""
        timestamp = datetime.strptime(
            log_entry.get("timestamp"), self.TIMESTAMP_FORMAT)
        return (timestamp, log_entry, message,
                byte_count, line_count, event_count)

    def __iter__(self):
        """Yield one tuple per log entry found in ``self.fh``.

        Each tuple is ``(timestamp, log_entry, message, byte_count,
        line_count, event_count)`` where ``timestamp`` is a ``datetime``,
        ``log_entry`` is the regex ``groupdict()``, ``message`` is the
        entry's message with any continuation lines appended, and the three
        counters are running totals at the moment the entry is emitted
        (``byte_count`` is the sum of ``len(line)`` over the lines read).

        Lines that neither match ``RE_LOG_ENTRY`` nor look like a
        continuation are reported on ``sys.stderr`` and skipped.
        """
        # Lines starting with one of these are glued to the previous entry
        # (e.g. stack trace frames).
        continuation_prefixes = ("\t", "Caused by", "org.")

        multiline_message = ""
        log_entry = None
        byte_count = 0
        line_count = 0
        event_count = 0

        for line in self.fh:
            # Count the amount of log data seen, before any normalisation.
            byte_count += len(line)
            line_count += 1

            # Normalise line endings.
            line = line.replace("\r\n", "\n")

            # Skip blank lines.
            if not line.strip():
                continue

            # Try to split the line into fields using the regex.
            m = re.match(self.RE_LOG_ENTRY, line)
            if m:
                # A new entry starts here, so the previously collected one
                # is complete and can be emitted (subject to filtering).
                if log_entry and self._should_emit(log_entry):
                    event_count += 1
                    yield self._build_event(
                        log_entry, multiline_message,
                        byte_count, line_count, event_count)
                # Start collecting the next entry.
                multiline_message = m.group("message")
                log_entry = m.groupdict()
            elif line.startswith(continuation_prefixes):
                multiline_message += line
            else:
                sys.stderr.write("Ei suutnud parsida rida:" + line)

        # The loop above only emits an entry when the *next* one begins, so
        # the final entry of the stream has to be flushed explicitly;
        # otherwise it would be silently dropped.
        if log_entry and self._should_emit(log_entry):
            event_count += 1
            yield self._build_event(
                log_entry, multiline_message,
                byte_count, line_count, event_count)
2018-06-15 21:21:13 +00:00
class JavaLogParser(GenericParser):
    """Parser for Java application log lines of the form
    ``<timestamp> <SEVERITY> 1 --- [<thread>] <class> : <message>``.
    """

    # strptime format of the captured "timestamp" group.
    TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

    # Raw string literals: the pattern contains "\[" and "\]", which are
    # invalid escape sequences in an ordinary string literal.
    # NOTE(review): the literal "1" presumably sits in the process-id column
    # of the log format -- confirm before parsing logs from other processes.
    RE_LOG_ENTRY = (
        r"(?P<timestamp>.+?) +(?P<severity>[A-Z]+) 1 --- "
        r"\[(?P<thread>.+)\](?P<class>.+?) *: (?P<message>.+)"
    )

    def is_serious(self, log_entry):
        """Return True when the entry's ``severity`` field equals "ERROR"."""
        return log_entry.get("severity") == "ERROR"
class ApacheLogParser(GenericParser):
    """Parser for Apache access log lines of the form
    ``<addr> - <user> [<timestamp> <offset>] "<VERB> <path> HTTP/1.x"
    <status> <size> "<referrer>" "<user agent>"``.
    """

    # Raw string literals: the pattern contains "\[", "\d" etc., which are
    # invalid escape sequences in an ordinary string literal. Single quotes
    # let the double quotes of the log format appear unescaped.
    #
    # The UTC offset accepts either sign (it used to require "+", so lines
    # from servers with a negative offset were rejected). The offset lies
    # outside the "timestamp" group and TIMESTAMP_FORMAT has no zone
    # directive, so the resulting datetime is naive.
    #
    # The "message" group spans the whole request line and contains the
    # nested "verb" and "path" groups.
    RE_LOG_ENTRY = (
        r'(?P<remote_addr>.+?) - (?P<username>.+?) '
        r'\[(?P<timestamp>.+?) [+-]\d\d\d\d\] '
        r'"(?P<message>(?P<verb>[A-Z]+) (?P<path>.+) HTTP/1\.[01])" '
        r'(?P<status>\d+) (?P<size>\d+) '
        r'"(?P<referrer>.+?)" "(?P<user_agent>.+?)"'
    )

    # strptime format of the captured "timestamp" group.
    TIMESTAMP_FORMAT = "%d/%b/%Y:%H:%M:%S"

    def is_serious(self, log_entry):
        """Return True when the numeric HTTP status is 400 or above."""
        return int(log_entry.get("status")) >= 400