From 06496804597cfe7173c988abed337beb1c8cb821 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lauri=20V=C3=B5sandi?= Date: Sat, 16 Jun 2018 00:21:13 +0300 Subject: [PATCH] =?UTF-8?q?Apache=20parsija=20t=C3=A4ielik=20n=C3=A4idis?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- raport.py | 34 +++++++++++ raport/README.md | 7 +++ raport/log_parsers.py | 49 ++++++++++++++++ raport/main.py | 129 ++++++++++++++++++++++++++++++++++++++++++ raport/template.html | 103 +++++++++++++++++++++++++++++++++ 5 files changed, 322 insertions(+) create mode 100644 raport.py create mode 100644 raport/README.md create mode 100644 raport/log_parsers.py create mode 100644 raport/main.py create mode 100644 raport/template.html diff --git a/raport.py b/raport.py new file mode 100644 index 0000000..87c3ecd --- /dev/null +++ b/raport.py @@ -0,0 +1,34 @@ +#!/usr/bin/python +# encoding: utf-8 + +""" +Parsi standardsisendist Apache logikirjed ja kuva edetabelid +Käivitamiseks: (cat access.log; zcat access.log.1.gz) | python raport.py +""" + +import re +import sys +from collections import Counter +from log_parsers import ApacheLogParser + +hits = Counter() # IP-d kust tuldi +urls = Counter() # URL-id mida külastati +agents = Counter() # User agent mida kasutati külastamisel + +for timestamp, log_entry, stack_trace in ApacheLogParser(sys.stdin): + urls[log_entry.get("path")] += 1 + hits[log_entry.get("remote_addr")] += 1 + agents[log_entry.get("user_agent")] += 1 + +print "Top5 külastatud URL-id veebiserveris:" +for path, count in urls.most_common(5): + sys.stdout.write("% 9d %s\n" % (count, path)) +print +print "Top5 külastusi teinud IP aadressid:" +for remote_addr, count in hits.most_common(5): + sys.stdout.write("% 9d %s\n" % (count, remote_addr)) +print +print "Top5 kasutatud veebilehitsejad/OS-id:" +for user_agent, count in agents.most_common(5): + sys.stdout.write("% 9d %s\n" % (count, user_agent)) + diff --git a/raport/README.md b/raport/README.md 
new file mode 100644 index 0000000..62a74d2 --- /dev/null +++ b/raport/README.md @@ -0,0 +1,7 @@ +# Apache logide parsija + +Komplektne näide kuidas Apache2 logisid parsida ning raporteerida huvitavamad killud: + +* Parsib Apache logifaili kirjed ApacheLogParser klassi abil mis on kirjeldatud failis log_parsers.py +* Värvib kaardi faili BlankMap-World6.svg ning salvestab top.svg faili sisse +* Genereerib Bootstrap baasil koostatud veebilehe mallist template.html faili raport.html diff --git a/raport/log_parsers.py b/raport/log_parsers.py new file mode 100644 index 0000000..03301a2 --- /dev/null +++ b/raport/log_parsers.py @@ -0,0 +1,49 @@ +# encoding: utf-8 +import re +import sys +from datetime import datetime + +class GenericParser(object): + def __init__(self, fh): # siia võid anda sys.stdin, gzip.open, open vms file handle tüüpi obj + self.fh = fh + + def __iter__(self): + multiline_message = "" + log_entry = None + byte_count = 0 + line_count = 0 + event_count = 0 + for line in self.fh: + byte_count += len(line) # loenda baite + line_count += 1 + + if not line.strip(): # jäta vahele tühjad read + continue + m = re.match(self.RE_LOG_ENTRY, line) + if m: + if log_entry and self.is_serious(log_entry): + stack_trace = "\n".join(multiline_message.split("\n")[1:]) + event_count += 1 + row = \ + datetime.strptime(log_entry.get("timestamp"), self.TIMESTAMP_FORMAT), \ + log_entry, stack_trace, byte_count, line_count, event_count + # See teeb funktsioonist generaatori/iteraatori + yield row + multiline_message = line + log_entry = m.groupdict() + else: + multiline_message += line + +class JavaLogParser(GenericParser): + TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f" + RE_LOG_ENTRY = "(?P.+?) (?P[A-Z]+) 1 --- \[(?P.+)\](?P.+) +: (?P.+)" + + def is_serious(self, log_entry): + return log_entry.get("severity") == "ERROR" + +class ApacheLogParser(GenericParser): + RE_LOG_ENTRY = "(?P.+?) - (?P.+?) \[(?P.+?) 
\+\d\d\d\d\] \"(?P[A-Z]+) (?P.+) HTTP/1.[01]\" (?P\d+) (?P\d+) \"(?P.+?)\" \"(?P.+?)\"" + TIMESTAMP_FORMAT = "%d/%b/%Y:%H:%M:%S" + + def is_serious(self, log_entry): + return int(log_entry.get("status")) >= 400 diff --git a/raport/main.py b/raport/main.py new file mode 100644 index 0000000..136e50f --- /dev/null +++ b/raport/main.py @@ -0,0 +1,129 @@ +#!/usr/bin/python +# encoding: utf-8 + +""" +Sõltuvuste paigladamiseks: + + apt install -y python-jinja2 python-lxml python-pygal + dnf install -y python-jinja2 python-lxml python-pygal +""" + +import GeoIP +import re +import sys +from collections import Counter +from datetime import datetime +from log_parsers import ApacheLogParser + +then = datetime.now() +hits_per_remote_addr = Counter() # IP-d kust tuldi +hits_per_path = Counter() # URL-id mida külastati +hits_per_user_agent = Counter() # User agent mida kasutati külastamisel +hits_per_country = Counter() # Riigid kust päringud tulid +hits_per_date = Counter() # Kuupäevad mil logikirjeid oli +bytes_per_date = Counter() + +gi = GeoIP.open("/usr/share/GeoIP/GeoIP.dat", GeoIP.GEOIP_MEMORY_CACHE) + +print "Loen standardsisendist..." 
+for timestamp, log_entry, stack_trace, byte_count, line_count, event_count in ApacheLogParser(sys.stdin): + country_code = gi.country_code_by_addr(log_entry.get("remote_addr")) + hits_per_path[log_entry.get("path")] += 1 + hits_per_remote_addr[log_entry.get("remote_addr")] += 1 + hits_per_user_agent[log_entry.get("user_agent")] += 1 + hits_per_country[country_code] += 1 + hits_per_date[timestamp.date()] += 1 + bytes_per_date[timestamp.date()] += int(log_entry.get("size")) + +# Leia kõige esimene kuupäev ning viimane kuupäev millal sündmused esinesid +first_date, last_date = min(hits_per_date.keys()), max(hits_per_date.keys()) + +############################ +### Värvi riigid kaardil ### +############################ + +import requests +from lxml import etree +from numpy import interp + +# Laadi alla kaart wikimedia veebist +print "Laadin alla kaarti..." +buf = requests.get("https://upload.wikimedia.org/wikipedia/commons/0/03/BlankMap-World6.svg").content + +# Parsi XML puu +map_document = etree.fromstring(buf) +for country, count in hits_per_country.items(): + if not country: + # Mõni IP ei pruukinud laheneda riigikoodiks (sisevõrk jms) + continue + hue = interp(count, [0, max(hits_per_country.values())], [180, 0]) + for element in map_document.xpath("//*[@id='%s']" % country.lower()): + element.set("style", "fill:hsl(%.2f, 60%%, 60%%)" % hue) + for subelement in element: + subelement.attrib.pop("class", "") + +with open("top.svg", "wb") as fh: + fh.write(etree.tostring(map_document)) +print "Kaart salvestatud faili top.svg" + + +#################################################### +### Koosta päringute arvu graafik päevade lõikes ### +#################################################### + +import matplotlib.pyplot as plt +fig = plt.figure(figsize=(10, 7)) + +# Lisa joonise sisse kaks graafikut +sub = fig.add_subplot(2, 1, 1) +sub2 = fig.add_subplot(2, 1, 2) + +sub.set_xlabel(u"Päringute arv") +sub2.set_xlabel(u"Liikluse maht baitides") + + +# Ploti andmepunktid 
+sub.barh(hits_per_date.keys(), hits_per_date.values()) +sub2.barh(bytes_per_date.keys(), bytes_per_date.values()) + +# Salvesta faili +fig.savefig("bar.svg", format="svg") +fig.savefig("bar.png") + + +####################################### +### Koosta veebilehitsejate graafik ### +####################################### + +import pygal +line_chart = pygal.Pie( + truncate_legend=50, # legend kuni 50 karakterit + width=1000, height=300, # graafiku laius/kõrgus pikslites + style=pygal.style.Style(background='transparent')) # eemalda taustavärv +line_chart.config(style_name = 'LightStyle', fill=None) +line_chart.title = 'Veebilehitsejate osakaal' +for user_agent, count in hits_per_user_agent.most_common(7): + line_chart.add(user_agent, count) # lisa graafikule + hits_per_user_agent.pop(user_agent) # eemalda counteri objektist +line_chart.add("Muud veebilehitsejad", sum(hits_per_user_agent.values())) + +user_agent_chart = line_chart.render(is_unicode=True, disable_xml_declaration=True) + + + +from jinja2 import Template +import codecs + +# Loe jinja mall UTF-8 tekstifailist +with codecs.open("template.html", "rb", encoding="utf-8") as fh: + template = Template(fh.read()) + # Süsti malli sisse kõik kohalikud muutujad (first_date, last_date, hits, urls jne) + buf = template.render(locals()) + + # Salvesta täidetud leht UTF-8 kodeeringus + with codecs.open("raport.html", "wb", encoding="utf-8") as fh: + fh.write(buf) + +print "HTML kujul raport savlestatud faili raport.html" + + diff --git a/raport/template.html b/raport/template.html new file mode 100644 index 0000000..9916b19 --- /dev/null +++ b/raport/template.html @@ -0,0 +1,103 @@ + + + + Apache logide raport + + + + + + + + + + +
+
+
+
+
+
+

Sisendandmed

+

+ Läbi näritud {{ byte_count | filesizeformat }} andmeid, + {{ line_count }} rida, {{ event_count }} logikirjet. + Logikirjed + {% if first_date == last_date %} + päeval {{ first_date }} + {% else %} + {{ first_date }} kuni {{ last_date }} + {% endif %} +

+
+
+
+
+

Kuupäevad

+

Päringute arv ning päringute maht kuupäevade kaupa grupeeritult, + ploteeritud matplotlib abil

+ +

Top 7 veebilehitsejad

+

Interaktiivne graafik ploteeritud pygal abil +

+ {{ user_agent_chart }} +
+
+
+
+
+

Top 10 URL-id

+
+
    + {% for path, count in hits_per_path.most_common(10) %} +
  1. {{ path }}: {{ count }}
  2. + {% endfor %} +
+
+
+
+
+
+

Top 10 IP aadressid

+
+
    + {% for remote_addr, count in hits_per_remote_addr.most_common(10) %} +
  1. {{ remote_addr }}: {{ count }}
  2. + {% endfor %} +
+
+
+
+
+
+

Külastused riikide kaupa

+

Loetud XML failist, värvitud lxml mooduli abil ning salvestatud top.svg faili sisse:

+
+ +
+
+
+
+
+

Fail genereeriti {{ datetime.now() }}, võttis aega {{ datetime.now() - then }}

+
+
+
+ +