#!/usr/bin/python
'''Threaded HTTP proxy server with cache control and hammering protection.

- A threaded crawler program can easily hammer a single site.
- This proxy therefore makes a request wait automatically when it detects
  that one individual host is being requested too frequently
  (e.g. 10 requests / sec); see ``ProxyHandler.unique_host_wait``.
- The cache expiry time can be customized via ``HTTPCache.expire`` or by
  overriding the ``HTTPCache.expired()`` method.

Usage:
    in command line:
    $ python proxy.py
    Serving HTTP on 0.0.0.0 port 8000 ...

    # then, what you have to do next is..
    os.environ['http_proxy'] = 'http://localhost:8000'
    # after this is set, all internet connections from your program will
    # start to go through the proxy server.

Runs on Python 2.6+ and Python 3.
'''

import os
import sys
import threading
import time

if sys.version_info[0] < 3:
    import BaseHTTPServer
    import SocketServer
    import httplib
    import md5  # only needed by the md5 file-naming variant noted below
    import urllib
    import urllib2
    import urlparse

    _quote = urllib.quote
    _urlparse = urlparse.urlparse
    _urlopen = urllib2.urlopen
    _HTTPError = urllib2.HTTPError
    _URLError = urllib2.URLError
    _MAX_INT = sys.maxint
else:
    # Python 3 moved these modules; bind them to the names used below.
    import http.client as httplib
    import http.server as BaseHTTPServer
    import socketserver as SocketServer
    import urllib.error
    import urllib.parse
    import urllib.request

    _quote = urllib.parse.quote
    _urlparse = urllib.parse.urlparse
    _urlopen = urllib.request.urlopen
    _HTTPError = urllib.error.HTTPError
    _URLError = urllib.error.URLError
    _MAX_INT = sys.maxsize

__author__ = 'Yusuke Yanbe'
__date__ = '10-6-2006'
__version__ = '0.9.0'
__license__ = 'GPL'
__copyright__ = 'Copyright (c) Yusuke Yanbe, 2006'
#__all__ = ['HTTP']


class HTTPCache(dict):
    '''File-backed cache of response bodies, keyed by request URL.

    Although this subclasses dict, nothing is kept in the dict itself:
    item access, assignment, deletion and ``in`` all operate on files
    stored as ``<cache_root>/<host>/<url-quoted URL>``.
    '''

    cache_root = '/tmp/http_cache'
    # Maximum age of an entry in seconds.  The default practically never
    # expires; use e.g. 60*60*24 for a day.
    expire = _MAX_INT

    def filename(self, path):
        '''Return the cache file name used for the URL *path*.'''
        # Alternative naming scheme: md5.md5(path).hexdigest() instead of
        # the quoted URL.
        # NOTE(review): the host part comes straight from the request URL
        # and is not sanitized (e.g. a host of '..') -- confirm whether
        # untrusted clients can reach this proxy.
        return self.dirname(path) + os.sep + _quote(path, safe='')

    def dirname(self, path):
        '''Return the per-host cache directory for the URL *path*.'''
        return self.cache_root + os.sep + _urlparse(path)[1]

    def expired(self, path):
        '''Return True if the entry for *path* is older than ``expire``.

        Raises OSError when no cache file exists for *path*.
        '''
        age = time.time() - os.stat(self.filename(path)).st_mtime
        return age > self.expire

    def __getitem__(self, path):
        '''Return the cached body (bytes) stored for *path*.'''
        # Binary mode: bodies are arbitrary bytes, not text.
        with open(self.filename(path), 'rb') as f:
            return f.read()

    def __setitem__(self, path, data):
        '''Store *data* (bytes) as the cached body for *path*.'''
        _dir = self.dirname(path)
        try:
            os.makedirs(_dir)
        except OSError:
            # Already existing is fine (possibly created by another thread
            # a moment ago); anything else is a real error.
            if not os.path.isdir(_dir):
                raise
        with open(self.filename(path), 'wb') as f:
            f.write(data)

    def __delitem__(self, path):
        '''Remove the cache file for *path*; raises OSError if missing.'''
        os.remove(self.filename(path))

    def __contains__(self, path):
        '''Return True if an unexpired cache file exists for *path*.'''
        try:
            return (os.access(self.filename(path), os.F_OK)
                    and not self.expired(path))
        except OSError:
            # The file disappeared between the existence test and the stat.
            return False


cache = HTTPCache()


class ProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    '''Request handler that serves GET requests from the cache or upstream.

    Upstream requests to one host are spaced at least ``unique_host_wait``
    seconds apart.  The time of the last request to a host is taken from
    the modification time of that host's cache directory.
    '''

    # Minimum number of seconds between two upstream requests to one host.
    unique_host_wait = 1
    # Failed fetch attempts tolerated before answering with a 500.
    max_fetch_errors = 3
    # Shared by every handler thread: serializes the "was this host
    # contacted recently?" check together with opening the connection.
    lock = threading.Lock()

    def do_GET(self):
        '''Answer from the cache when possible, otherwise fetch upstream.'''
        if self.path in cache:
            self.response_cache()
        else:
            self.response_remote_data()

    def response_cache(self):
        '''Send the cached body for ``self.path`` with a 200 status.'''
        # Read first so a failing read cannot leave a half-sent response.
        body = cache[self.path]
        self.send_response(200, 'OK')
        self._send_required_headers()
        self.wfile.write(body)

    def response_remote_data(self):
        '''Fetch ``self.path`` from the origin server and relay the result.

        A 200 response is stored in the cache.  An upstream HTTP error is
        relayed with its own status and body and is not cached.  A
        ValueError or an incomplete read is retried; after
        ``max_fetch_errors`` failed attempts a 500 is sent.  A failure to
        reach the server at all is answered with a 502.
        '''
        error_count = 0
        while True:
            try:
                remote = self._open_remote()
                if remote is None:
                    # Host was contacted too recently: wait, then re-check.
                    time.sleep(self.unique_host_wait)
                    continue
                try:
                    # Read the whole body before sending anything to the
                    # client, so a retry never emits two status lines.
                    code, msg, data = remote.code, remote.msg, remote.read()
                finally:
                    remote.close()
            except _HTTPError as e:
                code, msg, data = e.code, e.msg, e.read()
            except _URLError:
                # Must follow HTTPError, which is a subclass of URLError.
                self.send_error(502, 'Bad Gateway')
                return
            except (ValueError, httplib.IncompleteRead):
                self._discard_cache_entry()
                error_count += 1
                if error_count < self.max_fetch_errors:
                    continue
                # send_error() writes the complete error response itself.
                self.send_error(500, 'Internal Server Error')
                return
            break

        if code == 200:
            cache[self.path] = data
        self.send_response(code, msg)
        self._send_required_headers()
        self.wfile.write(data)

    def _open_remote(self):
        '''Open ``self.path`` upstream unless its host must be left alone.

        Returns the open response object, or None when the host was
        contacted less than ``unique_host_wait`` seconds ago.  The shared
        lock is held for the check and the connection attempt and is
        always released, including when urlopen raises.
        '''
        with ProxyHandler.lock:
            interval = time.time() - self._last_access_to_host(self.path)
            if interval < self.unique_host_wait:
                return None
            remote = _urlopen(self.path)
            if remote.code == 200:
                # Touch: creating the entry updates the host directory's
                # modification time, which other threads use for throttling.
                cache[self.path] = b''
            else:
                print('HTTP Error: %s' % remote.code)
            return remote

    def _discard_cache_entry(self):
        '''Best-effort removal of the cache file for ``self.path``.

        Used after a failed fetch so that the empty placeholder written
        by the touch is not served later as a valid cached page.
        '''
        try:
            del cache[self.path]
        except OSError:
            pass  # nothing was cached for this URL

    def _send_required_headers(self):
        '''Finish the header section of the response.

        send_response() has already emitted the Server and Date headers,
        so only the terminating blank line is still required.
        '''
        self.end_headers()

    def _last_access_to_host(self, path):
        '''Return the mtime of the host's cache directory, or -1 if none.'''
        try:
            return os.stat(cache.dirname(path)).st_mtime
        except OSError:
            return -1


class ThreadingHTTPServer(SocketServer.ThreadingMixIn,
                          BaseHTTPServer.HTTPServer):
    '''HTTPServer that handles each request in its own thread.'''
    pass


if __name__ == '__main__':
    BaseHTTPServer.test(ProxyHandler, ThreadingHTTPServer)