文章转载自:(原文链接在转载时丢失)
代理服务器:多线程抓取与校验 HTTP 代理的 Python 脚本(以下为正文代码)
# coding: utf-8
"""Multi-threaded proxy scraper and checker.

Scrapes HTTP proxies from the www.proxy.com.ru list pages, verifies each
one by fetching a test URL through it, then persists the working proxies
to a MySQL table (ip, port, speed, address), fastest first.

Ported from Python 2 (`urllib2`) to Python 3 (`urllib.request`).  The
scraping regex in the pasted original had its HTML tag literals stripped;
it is reconstructed here from the capture groups the code indexes
(row[1] = ip, row[2] = port, row[4] = address).
"""

import re
import threading
import time
import urllib.request

# Results are shared across threads; CPython's list.append is atomic, so
# plain lists are sufficient for this append-only usage.
rawProxyList = []      # [ip, port, addr] scraped by ProxyGet threads
checkedProxyList = []  # (ip, port, addr, seconds) for proxies that answered

# Listing pages: http://www.proxy.com.ru/list_1.html ... list_22.html
targets = ["http://www.proxy.com.ru/list_%d.html" % i for i in range(1, 23)]

# One table row per proxy: seq / ip / port / type / address.
# Only groups 1 (ip), 2 (port) and 4 (address) are used below.
p = re.compile(
    r'<tr><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td>'
    r'<td>(.+?)</td><td>(.+?)</td></tr>'
)


class ProxyGet(threading.Thread):
    """Fetch one listing page and append every scraped proxy to rawProxyList."""

    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target  # URL of the listing page this thread scrapes

    def getProxy(self):
        req = urllib.request.Request(self.target)
        with urllib.request.urlopen(req) as response:
            # The site serves GBK/cp936-encoded pages; skip undecodable bytes.
            result = response.read().decode("cp936", errors="ignore")
        for row in p.findall(result):
            # row = (seq, ip, port, type, address)
            rawProxyList.append([row[1], row[2], row[4]])

    def run(self):
        self.getProxy()


class ProxyCheck(threading.Thread):
    """Verify a slice of proxies by fetching a known page through each one."""

    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5                        # seconds per proxied request
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"                 # marker expected in the page body

    def checkProxy(self):
        cookies = urllib.request.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib.request.ProxyHandler(
                {"http": "http://%s:%s" % (proxy[0], proxy[1])})
            opener = urllib.request.build_opener(cookies, proxyHandler)
            opener.addheaders = [(
                'User-agent',
                'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) '
                'Gecko/20100101 Firefox/22.0')]
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                result = req.read().decode("utf-8", errors="ignore")
                timeused = time.time() - t1
                # Dead or hijacking proxies return junk pages: require the
                # marker string (original kept the `>= 1` position test).
                if result.find(self.testStr) >= 1:
                    checkedProxyList.append(
                        (proxy[0], proxy[1], proxy[2], timeused))
                    print("ok ip: %s %s %s %s"
                          % (proxy[0], proxy[1], proxy[2], timeused))
            except Exception:
                # Best-effort check: a proxy that errors out is simply skipped.
                continue

    def run(self):
        self.checkProxy()


def db_insert(insert_list):
    """Replace the `proxy` table contents with (ip, port, speed, address) rows."""
    # Third-party driver (mysqlclient); imported lazily so scraping/checking
    # works even when the DB driver is not installed.
    import MySQLdb
    try:
        # NOTE(review): credentials are hard-coded — move to config/env.
        conn = MySQLdb.connect(host="127.0.0.1", user="root",
                               passwd="meimei1118", db="ctdata",
                               charset='utf8')
        cursor = conn.cursor()
        cursor.execute('delete from proxy')
        cursor.execute('alter table proxy AUTO_INCREMENT=1')
        cursor.executemany(
            "INSERT INTO proxy(ip,port,speed,address) VALUES(%s, %s, %s,%s)",
            insert_list)
        conn.commit()
        cursor.close()
        conn.close()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))


if __name__ == "__main__":
    # One scraper thread per listing page.
    getThreads = [ProxyGet(target) for target in targets]
    for t in getThreads:
        t.start()
    for t in getThreads:
        t.join()

    print('.' * 10 + "总共抓取了%s个代理" % len(rawProxyList) + '.' * 10)

    # Split the raw list into 20 roughly equal chunks, one checker thread
    # each (ceiling division so no proxy is dropped).
    chunk = (len(rawProxyList) + 19) // 20
    checkThreads = [ProxyCheck(rawProxyList[chunk * i:chunk * (i + 1)])
                    for i in range(20)]
    for t in checkThreads:
        t.start()
    for t in checkThreads:
        t.join()

    # Fixed copy-paste bug: this line reported "scraped" for the checked count.
    print('.' * 10 + "总共有%s个代理通过校验" % len(checkedProxyList) + '.' * 10)

    # Keep proxies that answered within 8 s, fastest first, reordered to the
    # table's column order: (ip, port, speed, address).
    proxy_ok = [(pr[0], pr[1], pr[3], pr[2])
                for pr in sorted(checkedProxyList, key=lambda x: x[3])
                if pr[3] < 8]
    db_insert(proxy_ok)