Crawling Qiushibaike Articles with an IP Proxy Pool and a User-Agent Pool
A simple crawler that rotates through an IP proxy pool and a user-agent pool.
import re
import random
import urllib.request as urlreq
import urllib.error as urlerr

# User-agent pool
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]

# IP proxy pool
ipools = []

# Install an opener that carries a randomly chosen user agent
def get_ua(uapools):
    thisua = random.choice(uapools)
    header = ("User-Agent", thisua)
    url_opener = urlreq.build_opener()
    url_opener.addheaders = [header]
    urlreq.install_opener(url_opener)

# Build the IP pool: scrape the first page of xicidaili and save the IPs to a list
def get_ipools(ipurl):
    get_ua(uapools)
    data = urlreq.urlopen(ipurl).read().decode("utf-8", "ignore")
    pat = "/></td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>"
    ret = re.compile(pat, re.S).findall(data)
    # print(ret)
    for i in ret:
        ips = i[0] + ":" + i[1]   # join IP and port as "ip:port"
        ipools.append(ips)
    return ipools

# Parse the article text out of a Qiushibaike page
def get_article(data):
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst = re.compile(pat, re.S).findall(data)
    print(rst)
    # down_file(rst, i)

def get_html(urlweb):
    for i in range(1, 6):  # crawl the first five pages of articles
        while True:
            try:
                page = urlweb + str(i)
                thisua = random.choice(uapools)
                header = ("User-Agent", thisua)                              # build the user-agent header
                ip = random.choice(ipools)
                print("Current proxy IP: " + ip)
                proxy = urlreq.ProxyHandler({"http": ip})                    # build the IP proxy handler
                url_opener = urlreq.build_opener(proxy, urlreq.HTTPHandler)  # opener with the proxy attached
                url_opener.addheaders = [header]                             # attach the user-agent header
                urlreq.install_opener(url_opener)                            # install as the global opener
                data = urlreq.urlopen(page).read().decode("utf-8", "ignore")
            except Exception as e:
                print(e)
                ipools.remove(ip)  # on failure, drop the IP from the pool and retry this page
                continue
            get_article(data)      # parse the articles
            break                  # this page is done

if __name__ == "__main__":
    ipurl = "https://www.xicidaili.com/nn/"
    ipools = get_ipools(ipurl)  # build the IP pool
    urlweb = "https://www.qiushibaike.com/text/page/"
    get_html(urlweb)
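Free proxies scraped from xicidaili are often already dead, so the retry loop above can burn through many IPs before a page succeeds. A minimal sketch of pre-filtering the pool is shown below; the filter_alive helper, the httpbin.org test URL, and the 3-second timeout are assumptions for illustration, not part of the original script.

import urllib.request as urlreq

# Hypothetical helper (not in the original script): keep only proxies that
# can fetch a lightweight test URL within a short timeout.
def filter_alive(ipools, test_url="http://httpbin.org/ip", timeout=3):
    alive = []
    for ip in ipools:
        proxy = urlreq.ProxyHandler({"http": ip})
        opener = urlreq.build_opener(proxy, urlreq.HTTPHandler)
        try:
            opener.open(test_url, timeout=timeout)  # raises on connection failure
            alive.append(ip)
        except Exception:
            pass  # drop proxies that time out or refuse the connection
    return alive

# Possible usage before crawling: ipools = filter_alive(get_ipools(ipurl))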