logide-parsimine/bin/top5

52 lines
1.7 KiB
Python
Executable File

#!/usr/bin/python
# encoding: utf-8
import argparse
import re
from datetime import datetime, timedelta
import sys
from collections import Counter
hits = Counter()    # client IP addresses the requests came from
urls = Counter()    # URL paths that were requested
agents = Counter()  # user agents that made the requests

# One access-log line shaped like the Apache "combined" log format.
# Compared to a naive pattern this one:
#   * is a raw string, so backslash escapes reach the regex engine intact;
#   * accepts negative as well as positive UTC offsets in the timestamp;
#   * accepts any HTTP protocol version (e.g. HTTP/2.0), with the dot escaped;
#   * accepts "-" as the response size, which servers log for bodiless
#     responses such as 304 Not Modified.
RE_LOG_ENTRY = re.compile(
    r'(?P<remote_addr>.+?) - (?P<username>.+?) '
    r'\[(?P<timestamp>.+?) [+-]\d\d\d\d\] '
    r'"(?P<verb>[A-Z]+) (?P<path>.+) HTTP/\d(?:\.\d+)?" '
    r'(?P<status>\d+) (?P<size>\d+|-) '
    r'"(?P<referrer>.+?)" "(?P<agent>.+?)"'
)

# User-agent fragments that identify bots and search engines.
_RE_BOT = re.compile("(crawler|spider|Nuhk|Googlebot|yahoo|yandex)")

# Address prefixes for localhost and the internal network (e.g. monitoring).
_IGNORED_ADDR_PREFIXES = ("127.", "192.168.")

# Only these request methods count as visits (skips OPTIONS and the like).
_COUNTED_VERBS = ("GET", "POST")


def _is_counted(entry):
    """Return True if the matched log entry should be included in the stats."""
    if entry.group("remote_addr").startswith(_IGNORED_ADDR_PREFIXES):
        return False
    if entry.group("verb") not in _COUNTED_VERBS:
        return False
    if _RE_BOT.search(entry.group("agent")):
        return False
    return True


def _tally(lines, err):
    """Update the module-level counters from an iterable of log lines.

    Lines that do not match RE_LOG_ENTRY are reported on ``err`` and skipped.
    The timestamp is matched but deliberately not parsed: nothing uses the
    value, and parsing it would abort the whole run on one odd entry.
    """
    for line in lines:
        entry = RE_LOG_ENTRY.match(line)
        if not entry:
            err.write("Ei suutnud parsida rida: %s" % line)
            continue
        if not _is_counted(entry):
            continue
        hits[entry.group("remote_addr")] += 1
        urls[entry.group("path")] += 1
        agents[entry.group("agent")] += 1


def _report(out):
    """Write the three top-5 tables to ``out``, separated by blank lines."""
    sections = (
        ("Top 5 enim külastatud URL-i veebiserveris:", urls),
        ("Top 5 enim külastusi teinud IP aadressid:", hits),
        ("Top 5 enim kasutatud veebilehitsejad/OS-id:", agents),
    )
    for index, (title, counter) in enumerate(sections):
        if index:
            out.write("\n")
        # write() instead of the print statement: identical output, and the
        # code is valid on both Python 2 and Python 3.
        out.write(title + "\n")
        for key, count in counter.most_common(5):
            out.write("% 9d %s\n" % (count, key))


def main(lines=None, out=None, err=None):
    """Read access-log lines, count visits and print the top-5 report.

    Args:
        lines: iterable of log lines; defaults to ``sys.stdin``.
        out: writable text stream for the report; defaults to ``sys.stdout``.
        err: writable text stream for parse failures; defaults to
            ``sys.stderr``.
    """
    if lines is None:
        lines = sys.stdin
    if out is None:
        out = sys.stdout
    if err is None:
        err = sys.stderr
    _tally(lines, err)
    _report(out)


if __name__ == "__main__":
    main()