How to Increase Website Traffic with Python Crawler Proxies
This article explains how to use proxies with a Python crawler to increase website traffic. The example code is covered in detail and is a useful reference, so if you are interested, read through to the end!
Once you have a list of free proxies, there is a lot you can do with them: for example, crawl a site without the risk of having your IP banned, or, as in this article, increase a website's traffic.
Full code:
# coding:utf-8
import urllib2
import cookielib
import time
import json
import random

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


class Spide:
    def __init__(self, proxy_ip, proxy_type, proxy_port, use_proxy=False):
        print 'using the proxy info :', proxy_ip
        self.proxy_ip = proxy_ip
        self.proxy_type = proxy_type
        self.proxy_port = proxy_port
        self.proxy = urllib2.ProxyHandler({proxy_type: proxy_ip + ":" + proxy_port})
        self.usercode = ""
        self.userid = ""
        # cookie-aware opener; switch to the proxy opener when use_proxy is True
        self.cj = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        if use_proxy:
            self.opener = urllib2.build_opener(self.proxy)
        urllib2.install_opener(self.opener)

    def add_view(self):
        """Open the homepage through PhantomJS, then visit one random post."""
        print '---> start adding view'
        print '---> proxy info', self.proxy_ip
        service_args = [
            '--proxy=' + self.proxy_ip + ':' + self.proxy_port,
            '--proxy-type=' + self.proxy_type,
        ]
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
            "(KHTML, like Gecko) Chrome/15.0.87"
        )
        driver = webdriver.PhantomJS(executable_path='/home/bin/phantomjs',
                                     service_args=service_args,
                                     desired_capabilities=dcap)
        driver.set_page_load_timeout(90)
        driver.get("http://www.503error.com/")
        soup = BeautifulSoup(driver.page_source, 'xml')
        titles = soup.find_all('h2', {'class': 'entry-title'})
        # pick a random post from the homepage list (len(titles) - 1 keeps the index in range)
        ranCount = random.randint(0, len(titles) - 1)
        print 'random find a link of the website to access, random is:', ranCount
        randomlink = titles[ranCount].a.attrs['href']
        driver.get(randomlink)
        driver.close()
        print 'finish once'

    def get_proxy(self):
        """Fetch a single proxy (one JSON object) from the first free-proxy site."""
        proxy_info_json = ""
        print '--> using the ip ' + self.proxy_ip + ' to get the proxy info'
        try:
            reqRequest_proxy = urllib2.Request('url2')
            reqRequest_proxy.add_header('Accept', '*/*')
            reqRequest_proxy.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
            reqRequest_proxy.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36')
            reqRequest_proxy.add_header('Content-Type', 'application/x-www-form-urlencoded')
            proxy_info = urllib2.urlopen(reqRequest_proxy).read()
            print proxy_info
            proxy_info_json = json.loads(proxy_info)
            return_str = proxy_info_json['protocol'] + ":" + proxy_info_json['ip'] + proxy_info_json['port']
        except Exception as e:
            print 'proxy have problem'
        return proxy_info_json

    def get_proxys100(self):
        """Fetch a list of proxies (JSON array) from the second free-proxy site."""
        proxy_info_json = ""
        print '--> using the ip ' + self.proxy_ip + ' to get the proxy info 100'
        try:
            reqRequest_proxy = urllib2.Request('url1')
            reqRequest_proxy.add_header('Accept', '*/*')
            reqRequest_proxy.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
            reqRequest_proxy.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36')
            reqRequest_proxy.add_header('Content-Type', 'application/x-www-form-urlencoded')
            proxy_info = urllib2.urlopen(reqRequest_proxy).read()
            proxy_info_json = json.loads(proxy_info)
            return proxy_info_json
        except Exception as e:
            print 'proxy have problem'


if __name__ == "__main__":
    # first get the proxy list, then cycle through it adding views
    print 'START ADDING VIEW:'
    print 'Getting the new proxy info for the first time'
    print '----------------------------------------------------------'
    for count in range(1):
        test = Spide(proxy_ip='youproxyip', proxy_type='http', proxy_port='3128', use_proxy=False)
        proxy_list = test.get_proxy()
        print '-> this is the:', count
        print '-> getting the new proxy info:'
        print '-> using the proxy to get the proxy list, in case of getting banned'
        print '-> proxy info', proxy_list
        proxy100 = test.get_proxys100()
        for proxy1 in proxy100:
            try:
                print 'proxy1:', proxy1
                Spide1 = Spide(proxy_ip=proxy1['ip'], proxy_type=proxy1['type'],
                               proxy_port=proxy1['port'], use_proxy=True)
                print 'before add view'
                Spide1.add_view()
                print '-> sleep 15 s'
                time.sleep(15)
                # sleep an extra random amount of time before the next proxy
                ranTime = random.randint(10, 50)
                print '-> sleep random time:', ranTime
                time.sleep(ranTime)
                print '-> getting new proxy'
                # proxy_list = Spide1.get_proxy()
            except Exception as e:
                print '-> something wrong, hahah, next'
A few quick notes:
The overall flow is: 1. get a proxy -> 2. visit the homepage -> 3. collect the list of posts on the homepage and visit one at random -> 4. wait a random number of seconds -> back to step 1. (A minimal, modernized sketch of this loop follows the notes below.)
1: Change youproxyip to a proxy IP you already have, or leave it as-is, since use_proxy=False is passed later; in that case just make sure you can reach the two proxy-list sites in the code without using a proxy.
2: /home/bin/phantomjs is the path where phantomjs is installed; change it to match your own installation.
3: The code contains two methods for fetching proxies and the example uses one of them (don't flame me for the loop below that only runs once; this version originally had a real outer loop).
4: The addresses of the free proxy sources are omitted; url1 and url2 stand in for the hidden free-proxy sites. (The response shape the code expects is sketched right after these notes.)
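Since url1 and url2 are hidden, you have to plug in your own proxy source. The exact response format is not shown, but judging from how the parsed JSON is indexed (proxy1['ip'], proxy1['type'], proxy1['port'] in the main loop, and 'protocol'/'ip'/'port' in get_proxy), responses along the following lines should work; the shapes and values below are only an assumption for illustration:

# Assumed (illustrative) response shapes; real field names depend on your proxy source.
proxys100_response = [                    # what url1 / get_proxys100() is expected to return
    {"ip": "1.2.3.4", "port": "8080", "type": "http"},
    {"ip": "5.6.7.8", "port": "3128", "type": "http"},
]
proxy_response = {                        # what url2 / get_proxy() is expected to return
    "protocol": "http", "ip": "1.2.3.4", "port": "8080",
}

If your proxy source uses different field names, adjust the key lookups in get_proxy, get_proxys100 and the main loop accordingly.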
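For reference, here is a minimal Python 3 sketch of the same "get proxy -> homepage -> random post -> random wait" loop. It is not the original script: PhantomJS has been discontinued, so this assumes Selenium 4 with headless Chrome, and fetch_proxies() is a hypothetical placeholder standing in for whatever proxy source you use.

# Minimal Python 3 sketch of the flow, assuming Selenium 4 + headless Chrome.
import random
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def fetch_proxies():
    # placeholder: return a list of {"ip": ..., "port": ..., "type": ...} dicts
    return [{"ip": "1.2.3.4", "port": "8080", "type": "http"}]


def add_view(proxy, site="http://www.503error.com/"):
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--proxy-server=%s://%s:%s" % (proxy["type"], proxy["ip"], proxy["port"]))
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_page_load_timeout(90)
        driver.get(site)                                     # step 2: visit the homepage
        soup = BeautifulSoup(driver.page_source, "html.parser")
        titles = soup.find_all("h2", class_="entry-title")   # step 3: collect post links
        if titles:
            driver.get(random.choice(titles).a["href"])      # visit one post at random
    finally:
        driver.quit()


if __name__ == "__main__":
    for proxy in fetch_proxies():                # step 1: get a proxy
        try:
            add_view(proxy)
            time.sleep(random.randint(10, 50))   # step 4: wait a random number of seconds
        except Exception as exc:
            print("proxy failed, moving on:", exc)

The target site and the h2.entry-title selector are taken from the original code; swap them for your own site's post list.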
That is all there is to increasing website traffic with Python crawler proxies. Thanks for reading! I hope the content shared here is helpful; for more on related topics, follow the 亿速云 industry news channel!