Code:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import fpclib
import time
import thread
import sys

if len(sys.argv) < 2:
    print "%s [N]" % (sys.argv[0])
    sys.exit()

N = int(sys.argv[1])

proxies = []
keywords = u"socks proxy"
proxy_types = ("http", "socks4", "socks5")

TIMEOUT_GET = 10      # how many seconds to wait for a response from Google
TIMEOUT_GOOGLE = 2.0  # how long to wait after processing all sites (IPs) before the next Google request
TIMEOUT_CHECK = 15    # how long to wait for a response from the proxy server
TIMEOUT_WHILE = 1.0   # polling interval while waiting for the remaining ("hanging") checks to finish
N_T = 50              # maximum number of concurrent check threads
n_t = 0               # current number of running check threads

def func(proxy_hostname, proxy_port, timeout):
    """Check one ip:port against every proxy type and record the first one that works."""
    global n_t
    global proxies
    for proxy_type in proxy_types:
        if fpclib.check_proxy(proxy_hostname, proxy_port, proxy_type, timeout):
            s = "%s:%d" % (proxy_hostname, proxy_port)
            print "ACCEPT %s %s" % (proxy_type, s)
            # skip addresses that are already in the result list
            if s not in [p[1] for p in proxies]:
                proxies.append((proxy_type, s))
            break
    n_t -= 1

i = 0
while True:
    # fetch the next Google results page for the keywords and collect candidate sites
    url = fpclib.geturl_google_text(keywords, i)
    html = fpclib.gethttp_pycurl(url, TIMEOUT_GET)
    sites = fpclib.cuturi_google(html)
    print sites
    for url_site in sites:
        # scrape each site for ip:port pairs and check them in background threads
        html = fpclib.gethttp_pycurl(url_site, TIMEOUT_GET)
        addrs = fpclib.cutipport(html)
        print addrs
        for addr in addrs:
            ip, proxy_port = addr.split(":")
            proxy_port = int(proxy_port)
            while n_t > N_T:
                time.sleep(TIMEOUT_WHILE)
            n_t += 1
            thread.start_new(func, (ip, proxy_port, TIMEOUT_CHECK))
            if len(proxies) >= N:
                break
        if len(proxies) >= N:
            break
    if len(proxies) >= N:
        break
    i += 1
    time.sleep(TIMEOUT_GOOGLE)  # seconds

# wait for the remaining check threads, but no longer than MAX_WHILE polling cycles
j = 0
MAX_WHILE = 120
while n_t > 0:
    time.sleep(TIMEOUT_WHILE)
    j += 1
    if j > MAX_WHILE:
        break

if j > MAX_WHILE:
    print "BREAK"
else:
    print "QUIT"

print "---"
for proxy in proxies:
    print proxy[0], proxy[1]

----

fpclib.py:

#!/usr/bin/python
# -*- coding: utf-8 -*-

from StringIO import StringIO
import pycurl
import re
import urllib

def gethttp_pycurl_f(url, f, timeout=1, proxy_type="none", proxy_hostname="localhost", proxy_port=3128):
    """Fetch url, optionally through a proxy, writing the response body into the file-like object f."""
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEDATA, f)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)
    curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
    curl.setopt(pycurl.TIMEOUT, timeout)
    curl.setopt(pycurl.NOSIGNAL, 1)
    if proxy_type == "http":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
    elif proxy_type == "socks4":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)
    elif proxy_type == "socks5":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
    try:
        curl.perform()
    except Exception, e:
        print "Exception: %s" % str(e)
    curl.close()

def gethttp_pycurl(url, timeout=1, proxy_type="none", proxy_hostname="localhost", proxy_port=3128):
    """Fetch url, optionally through a proxy, and return the response body as a string."""
    body = StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEFUNCTION, body.write)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)
    curl.setopt(pycurl.CONNECTTIMEOUT, timeout)
    curl.setopt(pycurl.TIMEOUT, timeout)
    curl.setopt(pycurl.NOSIGNAL, 1)
    if proxy_type == "http":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
    elif proxy_type == "socks4":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)
    elif proxy_type == "socks5":
        curl.setopt(pycurl.PROXY, proxy_hostname)
        curl.setopt(pycurl.PROXYPORT, proxy_port)
        curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
    try:
        curl.perform()
    except Exception, e:
        print "Exception: %s" % str(e)
    page = body.getvalue()
    curl.close()
    return page

def geturl_rambler_text(text, i):
    site = "http://www.rambler.ru/lite"
    url = urllib.urlencode([
        ("oe", "1251"),
        ("words", text.encode("cp1251")),
        ("start", str(int(i) * 10 + 1))])
    return site + "?" + url

def geturl_rambler_images(text, i):
    site = "http://nova.rambler.ru/pictures"
    url = urllib.urlencode([("query", text.encode("utf-8"))])
    return site + "?" + url

def geturl_yandex_text(text, i):
    site = "http://yandex.ru/yandsearch"
    url = urllib.urlencode([
        ("rpt", "rad"),
        ("text", text.encode("utf-8")),
        ("p", str(int(i)))])
    return site + "?" + url

def geturl_yandex_images(text, i):
    site = "http://images.yandex.ru/yandsearch"
    url = urllib.urlencode([
        ("stype", "image"),
        ("text", text.encode("utf-8"))])  # utf-8
    return site + "?" + url

def geturl_google_text(text, i):
    site = "http://www.google.ru/search"
    url = urllib.urlencode([
        ("hl", "ru"),
        ("q", text.encode("utf-8")),
        ("start", str(int(i) * 10))])
    return site + "?" + url

def geturl_google_images(text, i):
    site = "http://images.google.ru/images"
    url = urllib.urlencode([
        ("hl", "ru"),
        ("gbv", "2"),
        ("btnG", "%D0%9F%D0%BE%D0%B8%D1%81%D0%BA+%D0%BA%D0%B0%D1%80%D1%82%D0%B8%D0%BD%D0%BE%D0%BA"),
        ("q", text.encode("utf-8"))])
    return site + "?" + url

def cuturi(text):
    """Extract all URIs from text."""
    return re.findall(r"((?:http://|https://|ftp://|gopher://|mailto:|xmpp:)(?:[\w\.]+:\d+)?(?:[^\"\'\t\n\r< >:]+))", text)

def cuturi_google(text):
    """Extract result URIs from a Google results page."""
    a = re.findall(r"((?:http://|https://|ftp://|gopher://|mailto:|xmpp:)(?:[\w\.]+:\d+)?(?:[^\"\'\t\n\r< >:]+\" target))", text)
    text = ' '.join(a)
    return cuturi(text)

def cutip(text):
    """Extract all IPv4 addresses from text."""
    return re.findall(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})", text)

def cutipport(text):
    """Extract all ip:port pairs from text."""
    return re.findall(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+)", text)

def check_proxy(proxy_hostname, proxy_port, proxy_type="http", timeout=30):
    """Return True if http://ya.ru/ is reachable through the given proxy."""
    url = "http://ya.ru/"
    html = gethttp_pycurl(url, timeout, proxy_type, proxy_hostname, proxy_port)
    return re.search("http://yandex.ru/", html) is not None
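
For reference, a minimal sketch of using the fpclib helpers on their own (assuming Python 2 with pycurl installed; the query, page number, and the proxy address 127.0.0.1:3128 are placeholders, not values from the script above):

import fpclib

# Build a Google search URL, fetch the page, and pull ip:port candidates out of it.
url = fpclib.geturl_google_text(u"socks proxy", 0)
html = fpclib.gethttp_pycurl(url, 10)
print fpclib.cutipport(html)

# Probe a single candidate through each supported proxy type (placeholder address).
for ptype in ("http", "socks4", "socks5"):
    if fpclib.check_proxy("127.0.0.1", 3128, ptype, 15):
        print "127.0.0.1:3128 works as %s" % ptype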