博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
python多线程抓取代理服务器
阅读量:6207 次
发布时间:2019-06-21

本文共 4666 字,大约阅读时间需要 15 分钟。

文章转载自:

 代理服务器:

 

#coding: utf-8
"""Scrape proxy-server listings with worker threads, verify them, and persist the good ones.

Pipeline, as run from the ``__main__`` section:

1. One ``ProxyGet`` thread per listing page appends ``[ip, port, addr]``
   entries to ``rawProxyList``.
2. Twenty ``ProxyCheck`` threads each test one slice of ``rawProxyList`` and
   append ``(ip, port, addr, seconds)`` tuples to ``checkedProxyList``.
3. Proxies that answered in under 8 seconds are sorted fastest-first and
   written to the ``proxy`` table by ``db_insert``.

The script targets Python 2 (``urllib2``, ``xrange``, byte-string decoding).
"""

import urllib2
import re
import time
import threading
import MySQLdb

# Shared result lists, appended to by the worker threads.
rawProxyList = []
checkedProxyList = []

# Listing pages to scrape (pages 1..22).
targets = [r"http://www.proxy.com.ru/list_%d.html" % i for i in xrange(1, 23)]

# Regex used to pull proxy rows out of a listing page.
# NOTE(review): this pattern has no literal text between its groups, and the
# last two lazy groups have nothing after them, so each of them matches
# exactly one character. It looks like HTML tags that originally separated
# the groups were lost when the listing was copied -- confirm against the
# page markup before relying on the captured ip/port/address values.
p = re.compile(r'''(\d+)(.+?)(\d+)(.+?)(.+?)''')


class ProxyGet(threading.Thread):
    """Thread that downloads one listing page and collects the proxies on it."""

    def __init__(self, target):
        """Remember the URL of the listing page this thread will fetch."""
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        """Fetch ``self.target`` and append every regex match to ``rawProxyList``.

        Each appended entry is ``[ip, port, addr]`` taken from match groups
        1, 2 and 4; the address is re-encoded from cp936 to UTF-8.
        Network errors are not caught here and propagate out of the thread.
        """
        req = urllib2.Request(self.target)
        response = urllib2.urlopen(req)
        try:
            result = response.read()
        finally:
            # Release the socket even when read() fails.
            response.close()

        for row in p.findall(result):
            ip = row[1]
            port = row[2]
            addr = row[4].decode("cp936").encode("utf-8")
            rawProxyList.append([ip, port, addr])

    def run(self):
        """Thread entry point: scrape the configured page."""
        self.getProxy()


class ProxyCheck(threading.Thread):
    """Thread that tests a slice of proxies by fetching a known page through each."""

    def __init__(self, proxyList):
        """Store the proxies to test and the fixed test settings."""
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        # Marker expected somewhere in the test page body.
        # NOTE(review): presumably a fragment of a registration number shown
        # in the page footer -- confirm it is still present on the page.
        self.testStr = "030173"

    def checkProxy(self):
        """Try every proxy in ``self.proxyList`` and record the working ones.

        A proxy counts as working when ``self.testUrl`` can be fetched through
        it within ``self.timeout`` seconds and the body contains
        ``self.testStr``. Working proxies are appended to ``checkedProxyList``
        as ``(ip, port, addr, seconds_taken)`` and reported on stdout.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler(
                {"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            t1 = time.time()

            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                try:
                    result = req.read()
                    timeused = time.time() - t1
                finally:
                    req.close()
            except Exception:
                # Best effort: any failure (refused, timeout, bad response)
                # simply means this proxy is unusable, so move on.
                continue

            # str.find() reports "not found" as -1, so a hit at offset 0 is
            # valid too; a membership test covers every position.
            if self.testStr in result:
                checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
                print("ok ip: %s %s %s %s" % (proxy[0], proxy[1], proxy[2], timeused))

    def run(self):
        """Thread entry point: check the assigned proxies."""
        self.checkProxy()


def db_insert(insert_list):
    """Replace the contents of the ``proxy`` table with ``insert_list``.

    ``insert_list`` is a sequence of ``(ip, port, speed, address)`` tuples.
    The table is emptied and its AUTO_INCREMENT counter reset before the rows
    are inserted. MySQL errors are reported on stdout rather than raised.
    """
    try:
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to configuration or environment variables.
        conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset='utf8')
        try:
            cursor = conn.cursor()
            cursor.execute('delete from proxy')
            cursor.execute('alter table proxy AUTO_INCREMENT=1')
            cursor.executemany("INSERT INTO proxy(ip,port,speed,address) VALUES(%s, %s, %s,%s)", insert_list)
            conn.commit()
            cursor.close()
        finally:
            # Close the connection even when a statement above fails.
            conn.close()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))


if __name__ == "__main__":
    # One scraping thread per listing page.
    getThreads = [ProxyGet(url) for url in targets]
    for t in getThreads:
        t.start()
    for t in getThreads:
        t.join()

    print('.' * 10 + "总共抓取了%s个代理" % len(rawProxyList) + '.' * 10)

    # Split the scraped proxies into 20 slices, one checker thread per slice.
    # Ceiling division keeps the slice size an integer and covers every item.
    chunkSize = (len(rawProxyList) + 19) // 20
    checkThreads = [
        ProxyCheck(rawProxyList[chunkSize * i:chunkSize * (i + 1)])
        for i in range(20)
    ]
    for t in checkThreads:
        t.start()
    for t in checkThreads:
        t.join()

    # This count is the number of proxies that passed the check.
    print('.' * 10 + "总共抓取了%s个代理" % len(checkedProxyList) + '.' * 10)

    # Persist proxies fastest-first, keeping only those quicker than 8 s.
    # Stored column order is (ip, port, speed, address).
    proxy_ok = []
    for proxy in sorted(checkedProxyList, key=lambda entry: entry[3]):
        if proxy[3] < 8:
            proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))

    db_insert(proxy_ok)

 

转载于:https://www.cnblogs.com/nju2014/p/4614698.html

你可能感兴趣的文章
MyBatis之输入与输出(resultType、resultMap)映射
查看>>
剥开比原看代码09:通过dashboard创建密钥时,前端的数据是如何传到后端的?
查看>>
51、YUM安装配置LAMP、phpMyAdmin实战
查看>>
Yeslab现任明教教主ISE课程前七部分免费发布
查看>>
linux下恢复误删文件
查看>>
Universal-Image-Loader,android-Volley,Picasso、Fresco和Glide开源组件加载网络图片的优缺点比较...
查看>>
RAID的肤浅认识
查看>>
postfix+dovecot+saslauthd+courier-authlib +mysql + extmail 完整虚拟邮箱系统部署
查看>>
《C语言深度剖析》学习笔记三
查看>>
Erlang并发机制 –进程调度
查看>>
XEN--转载自鸟哥的linux私房菜
查看>>
《在你身边,为你设计》-哪位知道下载、在线阅读地址啊?
查看>>
WAS 报错 Font '宋体' is not available to the JVM
查看>>
Windows更新补丁下载、批量安装的几种方法
查看>>
Petapoco使用SQLite的异常问题
查看>>
redhat6.4 安装oracle 10g error
查看>>
关闭子窗口 父窗口自动刷新
查看>>
简单了解tengine
查看>>
ln链接使用
查看>>
存档:老系统 WINDOWS 95 98 XP NT系统号
查看>>