#!/usr/bin/python
# encoding: utf-8
"""Summarise a web server access log read from standard input.

Each input line is matched against ``RE_LOG_ENTRY``.  Lines that do not
match are reported on standard error and skipped.  Requests from
loopback/internal addresses, requests whose verb is not GET or POST, and
requests from known bots are ignored.  For the remaining requests the
script prints the five most frequent URLs, client IP addresses and user
agents.

Usage: ``python <this script> < access.log``
"""
import argparse
import re
from datetime import datetime, timedelta
import sys
from collections import Counter

hits = Counter()    # IP addresses the requests came from
urls = Counter()    # URLs that were visited
agents = Counter()  # User agents used for the visits

# One access-log entry.  The groups remote_addr, timestamp, verb, path and
# agent are the ones this script reads.
# NOTE(review): remote_user, status, size and referrer are descriptive
# placeholder names for groups the script never reads -- confirm them
# against the web server's configured log format.
RE_LOG_ENTRY = (
    r'(?P<remote_addr>.+?) - (?P<remote_user>.+?) '
    r'\[(?P<timestamp>.+?) \+\d\d\d\d\] '
    r'"(?P<verb>[A-Z]+) (?P<path>.+) HTTP/1\.[01]" '
    r'(?P<status>\d+) (?P<size>\d+) '
    r'"(?P<referrer>.+?)" "(?P<agent>.+?)"'
)

# Compiled once so the per-line loop does not repeat the pattern lookup.
LOG_ENTRY = re.compile(RE_LOG_ENTRY)
BOT_AGENT = re.compile("(crawler|spider|Nuhk|Googlebot|yahoo|yandex)")

# Address prefixes treated as localhost / internal network (e.g. nagios).
INTERNAL_PREFIXES = ("127.", "192.168.")

# Only these request verbs are counted; everything else (e.g. OPTIONS)
# is skipped.
COUNTED_VERBS = ("GET", "POST")


def tally(lines, err=None):
    """Update the module-level counters from an iterable of log lines.

    ``lines`` is any iterable of strings (for example ``sys.stdin``).
    Lines that do not match ``RE_LOG_ENTRY`` are written to ``err``
    (``sys.stderr`` when ``err`` is None) and otherwise ignored.
    """
    if err is None:
        err = sys.stderr

    for line in lines:
        m = LOG_ENTRY.match(line)
        if not m:
            err.write("Ei suutnud parsida rida: %s" % line)
            continue

        remote_addr = m.group("remote_addr")
        agent = m.group("agent")

        # Ignore requests from localhost and the internal network.
        if remote_addr.startswith(INTERNAL_PREFIXES):
            continue

        # Skip OPTIONS and any other verb that is not GET or POST.
        if m.group("verb") not in COUNTED_VERBS:
            continue

        # Skip bots and search engines (case-sensitive substring match).
        if BOT_AGENT.search(agent):
            continue

        # The timestamp group is captured but not used by the report, so
        # it is deliberately not parsed here.
        hits[remote_addr] += 1
        urls[m.group("path")] += 1
        agents[agent] += 1


def format_top(counter, limit=5):
    """Return the ``limit`` most common entries of ``counter`` as text.

    Each entry becomes one line: the count right-aligned in nine
    columns, a space, then the key.
    """
    return "".join(
        "% 9d %s\n" % (count, key)
        for key, count in counter.most_common(limit)
    )


def report(out=None):
    """Write the three top-5 sections to ``out`` (default ``sys.stdout``)."""
    if out is None:
        out = sys.stdout

    out.write("Top 5 enim külastatud URL-i veebiserveris:\n")
    out.write(format_top(urls))

    out.write("\n")
    out.write("Top 5 enim külastusi teinud IP aadressid:\n")
    out.write(format_top(hits))

    out.write("\n")
    out.write("Top 5 enim kasutatud veebilehitsejad/OS-id:\n")
    out.write(format_top(agents))


def main():
    """Read the access log from standard input and print the summary."""
    tally(sys.stdin)
    report()


if __name__ == "__main__":
    main()