#!/usr/bin/env python
"""
logazier.py - v0.0.1

Scan a web server access log for requests claiming to be Googlebot and
verify each client IP with a reverse DNS lookup followed by a forward
lookup of the returned host name.  Verified IPs are cached in the
"trusted" file and skipped on later runs.

Tested with nginx log file. Should work with apache also.
Before using make sure to adjust the "filename" and "trusted" variables.

Other than standard Python libraries, PyDNS is also needed.
PyDNS can be found at : http://pydns.sourceforge.net/
or installed by running : easy_install pydns

Copyright (C) 2009 Sajal Kayan - sajal at thaindian.com

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# `commands` and `re` were imported by the original script; they are kept so
# no existing import is dropped, although the pipeline below now uses
# `subprocess` and plain string methods instead.
import commands
import os
import re
import subprocess

import DNS

# NOTE: the client IP must be the first whitespace-separated field of every
# log line; otherwise adjust the field extraction in _count_untrusted_ips().
filename = "/path/to/access.log"  # path to your apache logs (relative or absolute).
trusted = 'trusted.txt'  # Path to store trusted IPs (relative or absolute)

# Sentinel returned by the lookup helpers when a DNS query fails.
LOOKUP_FAILED = "err"

# The leading dot matters: without it "evilgooglebot.com" would also pass.
GOOGLEBOT_SUFFIX = ".googlebot.com"


def revlookup(ip):
    """Return the reverse-DNS host name for *ip*, or "err" on any failure."""
    try:
        return DNS.revlookup(ip)
    except Exception:
        # Best effort: an unresolvable IP is reported, not fatal.
        return LOOKUP_FAILED


def lookup(host):
    """Return the data of the first answer to an A query for *host*.

    Returns "err" if the query fails or yields no answers.
    """
    try:
        return DNS.DnsRequest(qtype='A').req(host).answers[0]['data']
    except Exception:
        # Covers DNS errors as well as an empty answer list (IndexError).
        return LOOKUP_FAILED


def _read_trusted(path):
    """Return the set of IPs cached in *path*, creating the file if missing."""
    if not os.path.exists(path):
        open(path, 'w').close()
        return set()
    with open(path) as cache:
        # Blank lines are ignored so a stray newline cannot affect filtering.
        return set(line.strip() for line in cache if line.strip())


def _count_untrusted_ips(logfile, skip):
    """Count Googlebot log lines per client IP, ignoring IPs in *skip*.

    Runs ``grep Googlebot`` over *logfile* and returns a dict mapping the
    first field of each matching line to its number of occurrences.
    grep's stderr is left attached to the caller's stderr so error messages
    are shown to the user instead of being parsed as log lines.
    """
    # Remove the "ionice", "-c3", "nice", "-n15" items if you don't care
    # about hogging all resources.
    argv = ["ionice", "-c3", "nice", "-n15",
            "grep", "--", "Googlebot", logfile]
    # An argument list (no shell) means the log path is never interpreted
    # by a shell.
    proc = subprocess.Popen(argv, stdout=subprocess.PIPE,
                            universal_newlines=True)
    counts = {}
    try:
        for line in proc.stdout:
            fields = line.split(None, 1)
            if not fields:
                continue  # blank line: nothing to attribute
            ip = fields[0]
            # Exact comparison: a trusted "1.2.3.4" must not hide "1.2.3.45".
            if ip in skip:
                continue
            counts[ip] = counts.get(ip, 0) + 1
    finally:
        proc.stdout.close()
        proc.wait()
    return counts


def main():
    """Print a TRUSTED/FAKE verdict for every not-yet-trusted Googlebot IP.

    Output is ordered by descending request count.  Each verified IP is
    appended to the "trusted" file as soon as it is confirmed, so progress
    survives an interrupted run.
    """
    counts = _count_untrusted_ips(filename, _read_trusted(trusted))
    ranked = sorted(((hits, ip) for ip, hits in counts.items()), reverse=True)

    for count, ip in ranked:
        host = revlookup(ip)
        # Resolve the host once, and never query DNS for the failure sentinel.
        if host == LOOKUP_FAILED:
            forward = LOOKUP_FAILED
        else:
            forward = lookup(host)

        prefix = str(count) + " - " + ip + " - " + host
        if host.endswith(GOOGLEBOT_SUFFIX) and forward == ip:
            print(prefix + " - TRUSTED")
            # add to cache of trusted
            with open(trusted, "a") as cache:
                cache.write(ip + "\n")
        else:
            print(prefix + " - FAKE - " + forward)


if __name__ == "__main__":
    main()